<a href="https://colab.research.google.com/github/VictorHugoMartins/israel_x_palestine_data_analysis/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pré-Processamento de Dados

Este arquivo prepara os dados para as fases posteriores.


# Importação de Dados

In [1]:
from google.colab import drive
drive.mount('/content/drive')

base_path = '/content/drive/My Drive/Mestrado'

country = 'palestine'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import os

def read_csv_batches_from_dir(directory):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
    df_list = []

    for file in all_files:
        print(f"Lendo arquivo: {file}")
        df = pd.read_csv(file)
        df_list.append(df)

    # Concatenar todos os DataFrames em um só
    final_df = pd.concat(df_list, ignore_index=True)
    return final_df

def import_data(country):
  # Caminho base no Google Drive

  # Diretórios contendo os arquivos CSV de comentários, vídeos e canais
  comments_dir = os.path.join(base_path, '{country}/comments_csv_batches'.format(country=country))
  videos_dir = os.path.join(base_path, '{country}/videos_csv_batches'.format(country=country))
  channels_dir = os.path.join(base_path, '{country}/channels_csv_batches'.format(country=country))

  # Ler todos os arquivos CSV dos diretórios
  df_comments = read_csv_batches_from_dir(comments_dir)
  df_videos = read_csv_batches_from_dir(videos_dir)
  df_channels = read_csv_batches_from_dir(channels_dir)

  return df_comments, df_videos, df_channels

palestine_df_comments, df_videos, df_channels = import_data('palestine')

original_comments = palestine_df_comments

Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_1.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_4.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_5.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_2.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_3.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_7.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_6.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_8.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_11.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_12.csv
Lendo arquivo: /content/drive/My Drive/Mestrado/palestine/comments_csv_batches/lote_9.csv
Lendo ar

## Dados Originais

### Comments

In [3]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id
0,rai_sj91Mts,Ugw9jGAzIj7ZEd3ctsJ4AaABAg.A9SXu5XETuNA9_YDq1UL2d,@ehitaredestiny3146,https://yt3.ggpht.com/DE8gbtNTtEb2oI-PKjR2Gw6k...,http://www.youtube.com/@ehitaredestiny3146,UCqC4Hff0ccqdyW9bIQzGSXQ,@@1nadiald what a beautiful liar! Maybe y’all ...,2024-10-14T15:41:23Z,2024-10-14T15:41:23Z,0,none,True,True,Ugw9jGAzIj7ZEd3ctsJ4AaABAg,UCatt7TBjfBkiJWx8khav_Gg
1,KnIUzxhDLXQ,UgzipCDxCGKZdbENemV4AaABAg,@darwisojol,https://yt3.ggpht.com/ytc/AIdro_njGmxKfoPrBMaI...,http://www.youtube.com/@darwisojol,UCSlryZdcLthIgB_uYpTgEUA,😂😂😂😂😂😂,2024-10-14T15:38:25Z,2024-10-14T15:38:25Z,0,none,True,False,,UC7fWeaHhqgM4Ry-RMpM2YYw
2,rai_sj91Mts,Ugw9jGAzIj7ZEd3ctsJ4AaABAg.A9SXu5XETuNA9_XoU4LceB,@1nadiald,https://yt3.ggpht.com/ytc/AIdro_nAc7gln-BI6Q9p...,http://www.youtube.com/@1nadiald,UCQxFdS-yyMmdd5LnXRG-FoQ,What a Dumas take!,2024-10-14T15:37:47Z,2024-10-14T15:37:47Z,0,none,True,True,Ugw9jGAzIj7ZEd3ctsJ4AaABAg,UCatt7TBjfBkiJWx8khav_Gg
3,lYXzMnN7SFg,UgzQrdFWVWfAQ8zcU3B4AaABAg,@andrysabatini,https://yt3.ggpht.com/6x1qe70o20YWrVdsYCEfDWfy...,http://www.youtube.com/@andrysabatini,UC-LNuL326djA50s-0PRq1Lw,uomo dispensatore di morte...,2024-10-14T15:35:50Z,2024-10-14T15:35:50Z,0,none,True,False,,UCef1-8eOpJgud7szVPlZQAQ
4,IpA8TU9brro,Ugz3S4TxqoQMvXcJ4aZ4AaABAg.A9WTlNPItFlA9_XLKZ1W5O,@1312Mork2,https://yt3.ggpht.com/oSPCnt4WtvcVNqVW4Zvcty5q...,http://www.youtube.com/@1312Mork2,UC6VNPUQL3ltQsJ8bUfRswiQ,You need to stop doing thought crime,2024-10-14T15:33:40Z,2024-10-14T15:33:40Z,0,none,True,True,Ugz3S4TxqoQMvXcJ4aZ4AaABAg,UCckHqySbfy5FcPP6MD_S-Yg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814970,d8AP7rQvV6Y,Ugw51FdhTUWk9lGjRTd4AaABAg,@faizahfaizah8255,https://yt3.ggpht.com/ytc/AIdro_nFJwocGV8DNqOb...,http://www.youtube.com/@faizahfaizah8255,UC3XRFOdxYT0MHUaINaosDww,Ber arti udah mulai jatuh alhamdulillah😊,2024-09-17T05:54:33Z,2024-09-17T05:54:33Z,128,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
814971,d8AP7rQvV6Y,UgwNzQ7P-35_ZPkw-MZ4AaABAg,@lukmanjourney641,https://yt3.ggpht.com/KLiMwgezEoPzLeuspnu2mMm_...,http://www.youtube.com/@lukmanjourney641,UCn3ZkgKF-r_QYzp8RiLCETQ,Pertanda idf sudah mau punah dihajar hamas mak...,2024-09-17T05:53:14Z,2024-09-17T05:53:14Z,149,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
814972,d8AP7rQvV6Y,UgyFeWKQDfg6KgAOorx4AaABAg,@Saskoloep,https://yt3.ggpht.com/ytc/AIdro_n-ipYGNyWfz12O...,http://www.youtube.com/@Saskoloep,UCRRCgLlbKDst9fFjEpfRP_A,"Strategi jitu, kalau tidak mau dikembalikan ke...",2024-09-17T05:45:38Z,2024-09-17T05:45:38Z,11,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
814973,d8AP7rQvV6Y,Ugyi8qwzcz7Ktvg42A94AaABAg,@faizalarffin2482,https://yt3.ggpht.com/ytc/AIdro_kdKHEaF6TdB4sb...,http://www.youtube.com/@faizalarffin2482,UCsgog3YohGNKrJOhD78RpUQ,"Tak payah lah Nak menipu lagi, lagipun negara ...",2024-09-17T05:43:54Z,2024-09-17T05:43:54Z,0,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg


In [4]:
palestine_df_comments.describe()

Unnamed: 0,like_count
count,814975.0
mean,6.592394
std,95.21846
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,23049.0


### Vídeos

In [5]:
df_videos

Unnamed: 0,video_id,title,description,channel_id,published_at,category_id,tags,view_count,like_count,comment_count,...,scheduled_end_time,concurrent_viewers,active_live_chat_id,recording_date,topicCategories,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
0,Q_7QIgSANlg,Israeli strike in Aitou in northern Lebanon ki...,The Lebanese Red Cross said 18 people were kil...,UC16niRr50-MSBwiO3YDb3RA,2024-10-14T15:43:19Z,25,"['bbc', 'bbc news', 'news', 'world news', 'bre...",7860,158,113,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
1,bEzEBz7d1Z8,Israel Vs Hezbollah War LIVE | Inside Israel's...,Israel Vs Hezbollah War LIVE | Inside Israel's...,UCef1-8eOpJgud7szVPlZQAQ,2024-10-14T15:34:53Z,25,"['Israel Vs Hezbollah War', 'Israel Lebanon Wa...",9347,66,3,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,
2,tmDjav-5dUI,Israeli air strike destroys historic mosque in...,Israel has destroyed an old mosque in southern...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-10-14T14:38:56Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",7805,1179,342,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
3,NOLJdq4zWFs,Israeli shelling of Gaza school kills at least...,An Israeli attack on a school used to shelter ...,UC16niRr50-MSBwiO3YDb3RA,2024-10-14T14:34:39Z,25,"['bbc', 'bbc news', 'news', 'world news', 'bre...",7076,364,174,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
4,gTP3ug1DBs0,Israel's brutality towards Palestinians didn’t...,The brutality Israel has been unleashing on Pa...,UC7fWeaHhqgM4Ry-RMpM2YYw,2024-10-14T13:54:43Z,25,"['TRT world', 'trtworld', 'World News', 'break...",3866,1391,49,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,rGgnRV_Hvw0,‘Lawless world’ faced by Palestinians in the W...,“The fact that I have an Israeli passport and ...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-09-15T14:32:18Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",7690,809,73,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
807,a64qJkg2Ac4,Hypersonic Missile Hits Israel | US and Israel...,#Israel #Yemen #Missile #PrashantDhawan #Prash...,UCrC8mOqJQpoB7NuIMKIS6rQ,2024-09-15T14:28:00Z,27,"['Gaza', 'Hamas', 'Houthi attacks on israel', ...",1871962,58048,4592,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
808,oVR7mgvISiA,Gaza Offensive: Israel Recruiting Asylum Seeke...,Israel is recruiting African asylum seekers to...,UC_xWTNsx7zA2uI0ydQikViA,2024-09-15T13:40:01Z,25,"['dawn news english', 'dawn news', 'dawn news ...",1003,19,5,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
809,N45Kdm-Ue-U,Iran’s 3-Way Proxy War Crushes IDF; Israel Scr...,A fresh report by Israeli media outlet Haaretz...,UCckHqySbfy5FcPP6MD_S-Yg,2024-09-15T12:10:00Z,25,"['times of india', 'toi', 'latest news', 'news...",21121,281,163,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,


In [6]:
df_videos.describe()

Unnamed: 0,category_id,view_count,like_count,comment_count,scheduled_end_time,concurrent_viewers,active_live_chat_id,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
count,811.0,811.0,811.0,811.0,0.0,811.0,0.0,0.0,811.0,811.0,811.0,0.0
mean,24.890259,216068.5,5351.081381,1025.027127,,0.0,,,0.0,0.0,0.0,
std,1.803711,470547.5,14116.863401,2457.954095,,0.0,,,0.0,0.0,0.0,
min,1.0,4.0,0.0,1.0,,0.0,,,0.0,0.0,0.0,
25%,25.0,19977.5,448.5,99.0,,0.0,,,0.0,0.0,0.0,
50%,25.0,65835.0,1563.0,377.0,,0.0,,,0.0,0.0,0.0,
75%,25.0,219457.5,4485.5,1095.5,,0.0,,,0.0,0.0,0.0,
max,29.0,6836883.0,216578.0,35712.0,,0.0,,,0.0,0.0,0.0,


### Canais

In [7]:
df_channels

Unnamed: 0,channel_id,title,description,published_at,country,view_count,comment_count,subscriber_count,video_count,is_verified,keywords,profile_picture_url
0,UC3asD3ORk07XRPGC47hr-0w,GNC NEWS 360,🌍 Welcome to GNC – Global News Channel! 🌍\n\nA...,2024-09-22T02:57:31.675741Z,US,7096,0,27,20,False,,https://yt3.ggpht.com/rE7KFfBo-a7KVXDR2xtVoESi...
1,UCkmLoDQ8AVKwEsDSJtNKmug,WOLD NEWS,Subscribe for USA,2024-08-31T21:48:41.363607Z,US,372583,0,481,275,False,"trump noticia ""kamalla harris"" vote",https://yt3.ggpht.com/VZXxIUrbv9NoSwomQz_4sOlC...
2,UC1Dlsbku0-iCGc0rXgQpDHQ,Amaizing Facts,Target Subscribers 1 Million\n\nHello everyone...,2024-06-05T10:37:49.325123Z,PK,84711317,0,238000,412,False,short shorts animals viral edits israel palest...,https://yt3.ggpht.com/dCjKkhHRFEPvorpV1vhdD1XT...
3,UC3-n3-ScTQLQ_Gc2tNLJreg,Stakelbeck Tonight with Erick Stakelbeck,Welcome to the Stakelbeck Tonight with Erick S...,2024-05-03T21:23:42.240192Z,US,2423653,0,31300,70,False,"""erick stakelbeck"" ""the watchman with erick st...",https://yt3.ggpht.com/Oz0YlzbQAiPwkqA0vH1832AL...
4,UCF9LFWX5cdGHBg_FDm6tFrQ,Breezy Politics,Politics By The People.,2024-02-24T18:00:09.134775Z,US,101447822,0,206000,575,False,,https://yt3.ggpht.com/I5YmN4cKN0Vaon04Qs9hBbF2...
...,...,...,...,...,...,...,...,...,...,...,...,...
216,UCfmSignFWkk1lw4015hCyyQ,StandWithUs,Fighting Antisemitism and Supporting Israel Ar...,2006-06-06T22:44:08Z,IL,44427727,0,192000,1465,False,standwithus Israel Education activism pro-isra...,https://yt3.ggpht.com/ytc/AIdro_mh8eN8fDlZehoy...
217,UC16niRr50-MSBwiO3YDb3RA,BBC News,Welcome to the official BBC News YouTube chann...,2006-04-08T05:51:05Z,GB,5607586404,0,16800000,23220,False,BBC News Official YouTube Channel,https://yt3.ggpht.com/y_esGAQOhX4rTpWvrALErAJl...
218,UCHpw8xwDNhU9gdohEcJu4aA,The Guardian,"The Guardian brings you news, documentaries, a...",2006-02-15T03:22:40Z,GB,729761411,0,2290000,7250,False,guardian news film video environment documenta...,https://yt3.ggpht.com/P7QrZEnioCzOeKA2sWKyxvR5...
219,UCm7lHFkt2yB_WzL67aruVBQ,Hindustan Times,"Hindustan Times Videos bring you news, views a...",2005-11-16T02:41:16Z,IN,5607879981,0,7370000,69366,False,"""world news"" ""us news"" ""HT world"" ""Hindustan T...",https://yt3.ggpht.com/rxxycwwjFXuC-eQNBcklj4P-...


In [8]:
df_channels.describe()

Unnamed: 0,view_count,comment_count,subscriber_count,video_count
count,221.0,221.0,221.0,221.0
mean,1493047000.0,0.0,2641411.0,36474.914027
std,2928207000.0,0.0,3996751.0,72141.218975
min,7096.0,0.0,27.0,20.0
25%,28142960.0,0.0,152000.0,711.0
50%,271130400.0,0.0,965000.0,3043.0
75%,1490802000.0,0.0,3570000.0,37819.0
max,18126010000.0,0.0,22100000.0,610000.0


# Limpeza de Comentários

Redução do vocabulário dos comentários coletados. Vídeos e Canais não passam por esse processo por não serem o foco da pesquisa e servirem como bases auxiliares para o percurso principal.

## Remoção de Dados Duplicados

In [9]:
# Função para remover duplicatas e manter os registros mais recentes.
# Intuito: garantir que não foram coletados dados repetidos para diferentes países
def remove_duplicates(df, id_column, date_column):
    # Ordena por data de forma decrescente e remove duplicatas mantendo o mais recente
    df_sorted = df.sort_values(by=date_column, ascending=False)
    df_unique = df_sorted.drop_duplicates(subset=id_column, keep='first')
    return df_unique

In [10]:
# Gerar as estatísticas
palestine_df_comments = remove_duplicates(palestine_df_comments, 'comment_id', 'updated_at')

## Filtragem por idioma

Apenas comentários em inglês serão utilizados na pesquisa. Não haverá, por agora, uma abordagem multilanguage.

In [11]:
# 1. Instalar FastText
!pip install fasttext

# 2. Importar bibliotecas necessárias
import fasttext
import pandas as pd

# 3. Baixar o modelo de identificação de idiomas do FastText
# Use the raw file URL to download the binary model directly
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O lid.176.bin

--2024-11-05 17:23:15--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.128, 13.35.7.38, 13.35.7.50, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2024-11-05 17:23:16 (68.2 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



In [12]:
# Carregae o modelo de linguagem
def load_model():
    model_path = 'lid.176.bin'
    model = fasttext.load_model(model_path)
    return model

model = load_model()

# Função para identificar o idioma de um comentário
def detect_language(comment):
    # Remover quebras de linha
    if isinstance(comment, str):
      comment = comment.replace('\n', ' ').strip()  # Remove quebras de linha e espaços extras
      prediction = model.predict(comment)
      return prediction[0][0]  # Retornar o código do idioma
    else:
      return ''

def filter_by_english(df_comments, verbose=False):
  # Aplicar a função de detecção de idioma a cada comentário
  df_comments['language'] = df_comments['comment'].apply(detect_language)

  # Filtrar os comentários em inglês
  english_comments = df_comments[df_comments['language'] == '__label__en']

  if verbose:
    print(english_comments[['comment_id', 'comment']])

  return english_comments

In [13]:
palestine_df_comments = filter_by_english(palestine_df_comments, verbose=True)

                                               comment_id  \
789472                         UgyDIMGydtCmgB41-SN4AaABAg   
789473                         UgxUGpdHzAceyYbIWDN4AaABAg   
789474                         UgyPfhLChrSMH0hSopR4AaABAg   
789475                         UgzcyIzP8x-MpOea_7V4AaABAg   
759573  UgxUNu91s5_4wWdEJ4h4AaABAg.A8xFzWWQ0MSAAGvsgmW2l1   
...                                                   ...   
689599                         UgzaDFIViC98wmEqEdR4AaABAg   
689600                         Ugw2Jtx0n6qufNR4zQJ4AaABAg   
689601                         UgyHaEtt528FzLJe_6J4AaABAg   
689602                         UgzrRIcuLYaO8BfdeM54AaABAg   
689603                         UgxLKgbvmmwQxoNj3WF4AaABAg   

                                                  comment  
789472  it is said that the 1948 UN resolution justifi...  
789473  🛑the cease fire deal‼️Lebanon🟰Iran,, Syria🟰Rus...  
789474                        As they say, “do the math!”  
789475        Nice try B***

### Filtragem de Vídeos e Canais

Aqui, calculamos o nosso universo a quantidade de canais, vídeos e comentários em inglês. Suas novas quantidades são calculadas, assim como o número de usuários comentaristas.

In [14]:
# Filtrar vídeos que possuem channel_id em df_channels e video_id em df_comments
df_videos = df_videos[
    (df_videos['video_id'].isin(palestine_df_comments['video_id']))
]

df_videos

Unnamed: 0,video_id,title,description,channel_id,published_at,category_id,tags,view_count,like_count,comment_count,...,scheduled_end_time,concurrent_viewers,active_live_chat_id,recording_date,topicCategories,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
0,Q_7QIgSANlg,Israeli strike in Aitou in northern Lebanon ki...,The Lebanese Red Cross said 18 people were kil...,UC16niRr50-MSBwiO3YDb3RA,2024-10-14T15:43:19Z,25,"['bbc', 'bbc news', 'news', 'world news', 'bre...",7860,158,113,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
1,bEzEBz7d1Z8,Israel Vs Hezbollah War LIVE | Inside Israel's...,Israel Vs Hezbollah War LIVE | Inside Israel's...,UCef1-8eOpJgud7szVPlZQAQ,2024-10-14T15:34:53Z,25,"['Israel Vs Hezbollah War', 'Israel Lebanon Wa...",9347,66,3,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,
2,tmDjav-5dUI,Israeli air strike destroys historic mosque in...,Israel has destroyed an old mosque in southern...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-10-14T14:38:56Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",7805,1179,342,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
3,NOLJdq4zWFs,Israeli shelling of Gaza school kills at least...,An Israeli attack on a school used to shelter ...,UC16niRr50-MSBwiO3YDb3RA,2024-10-14T14:34:39Z,25,"['bbc', 'bbc news', 'news', 'world news', 'bre...",7076,364,174,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
4,gTP3ug1DBs0,Israel's brutality towards Palestinians didn’t...,The brutality Israel has been unleashing on Pa...,UC7fWeaHhqgM4Ry-RMpM2YYw,2024-10-14T13:54:43Z,25,"['TRT world', 'trtworld', 'World News', 'break...",3866,1391,49,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,rGgnRV_Hvw0,‘Lawless world’ faced by Palestinians in the W...,“The fact that I have an Israeli passport and ...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-09-15T14:32:18Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",7690,809,73,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
807,a64qJkg2Ac4,Hypersonic Missile Hits Israel | US and Israel...,#Israel #Yemen #Missile #PrashantDhawan #Prash...,UCrC8mOqJQpoB7NuIMKIS6rQ,2024-09-15T14:28:00Z,27,"['Gaza', 'Hamas', 'Houthi attacks on israel', ...",1871962,58048,4592,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
808,oVR7mgvISiA,Gaza Offensive: Israel Recruiting Asylum Seeke...,Israel is recruiting African asylum seekers to...,UC_xWTNsx7zA2uI0ydQikViA,2024-09-15T13:40:01Z,25,"['dawn news english', 'dawn news', 'dawn news ...",1003,19,5,...,,0,,,['https://en.wikipedia.org/wiki/Society'],,0,0,0,
809,N45Kdm-Ue-U,Iran’s 3-Way Proxy War Crushes IDF; Israel Scr...,A fresh report by Israeli media outlet Haaretz...,UCckHqySbfy5FcPP6MD_S-Yg,2024-09-15T12:10:00Z,25,"['times of india', 'toi', 'latest news', 'news...",21121,281,163,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,


In [15]:
df_channels = df_channels[
    (df_channels['channel_id'].isin(palestine_df_comments['channel_id'])) |
    (df_channels['channel_id'].isin(palestine_df_comments['author_channel_id']))
]

df_channels

Unnamed: 0,channel_id,title,description,published_at,country,view_count,comment_count,subscriber_count,video_count,is_verified,keywords,profile_picture_url
0,UC3asD3ORk07XRPGC47hr-0w,GNC NEWS 360,🌍 Welcome to GNC – Global News Channel! 🌍\n\nA...,2024-09-22T02:57:31.675741Z,US,7096,0,27,20,False,,https://yt3.ggpht.com/rE7KFfBo-a7KVXDR2xtVoESi...
1,UCkmLoDQ8AVKwEsDSJtNKmug,WOLD NEWS,Subscribe for USA,2024-08-31T21:48:41.363607Z,US,372583,0,481,275,False,"trump noticia ""kamalla harris"" vote",https://yt3.ggpht.com/VZXxIUrbv9NoSwomQz_4sOlC...
2,UC1Dlsbku0-iCGc0rXgQpDHQ,Amaizing Facts,Target Subscribers 1 Million\n\nHello everyone...,2024-06-05T10:37:49.325123Z,PK,84711317,0,238000,412,False,short shorts animals viral edits israel palest...,https://yt3.ggpht.com/dCjKkhHRFEPvorpV1vhdD1XT...
3,UC3-n3-ScTQLQ_Gc2tNLJreg,Stakelbeck Tonight with Erick Stakelbeck,Welcome to the Stakelbeck Tonight with Erick S...,2024-05-03T21:23:42.240192Z,US,2423653,0,31300,70,False,"""erick stakelbeck"" ""the watchman with erick st...",https://yt3.ggpht.com/Oz0YlzbQAiPwkqA0vH1832AL...
4,UCF9LFWX5cdGHBg_FDm6tFrQ,Breezy Politics,Politics By The People.,2024-02-24T18:00:09.134775Z,US,101447822,0,206000,575,False,,https://yt3.ggpht.com/I5YmN4cKN0Vaon04Qs9hBbF2...
...,...,...,...,...,...,...,...,...,...,...,...,...
216,UCfmSignFWkk1lw4015hCyyQ,StandWithUs,Fighting Antisemitism and Supporting Israel Ar...,2006-06-06T22:44:08Z,IL,44427727,0,192000,1465,False,standwithus Israel Education activism pro-isra...,https://yt3.ggpht.com/ytc/AIdro_mh8eN8fDlZehoy...
217,UC16niRr50-MSBwiO3YDb3RA,BBC News,Welcome to the official BBC News YouTube chann...,2006-04-08T05:51:05Z,GB,5607586404,0,16800000,23220,False,BBC News Official YouTube Channel,https://yt3.ggpht.com/y_esGAQOhX4rTpWvrALErAJl...
218,UCHpw8xwDNhU9gdohEcJu4aA,The Guardian,"The Guardian brings you news, documentaries, a...",2006-02-15T03:22:40Z,GB,729761411,0,2290000,7250,False,guardian news film video environment documenta...,https://yt3.ggpht.com/P7QrZEnioCzOeKA2sWKyxvR5...
219,UCm7lHFkt2yB_WzL67aruVBQ,Hindustan Times,"Hindustan Times Videos bring you news, views a...",2005-11-16T02:41:16Z,IN,5607879981,0,7370000,69366,False,"""world news"" ""us news"" ""HT world"" ""Hindustan T...",https://yt3.ggpht.com/rxxycwwjFXuC-eQNBcklj4P-...


## Download de Bases Pré-Treinadas

In [16]:
!pip install nltk

import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('rslp')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('punkt')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Limpeza Textual


In [17]:
!pip install ftfy
!pip install apyori

import csv
import re
import ftfy
from nltk.corpus import stopwords

import pandas as pd



In [18]:
# Função para extrair URLs
def extract_urls(comment):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.findall(url_pattern, comment)

# Função para extrair menções
def extract_mentions(comment):
    mention_pattern = r'@[\w]+'
    return re.findall(mention_pattern, comment)

# Aplicando as funções para criar as novas colunas
palestine_df_comments['urls'] = palestine_df_comments['comment'].apply(extract_urls)
palestine_df_comments['mentions'] = palestine_df_comments['comment'].apply(extract_mentions)

## Limpeza de texto

In [19]:
def clean_data(text):
    text = str(text)
    text = ftfy.fix_text(text)  # Corrige erros de codificação de text
    text = text.split("#")[0]
    text = text.split("https://")[0]
    text = re.sub(r'k{2,}', '', text) # Remover a sequência  de k's
    text = re.sub(r'\d+', '', text)  # Remove números
    text = re.sub(r'@\w+\s?', '', str(text))  # Remove o padrão "@algumusuario"
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)  # Remove a sequência "RT"
    text = re.sub(r'\s?:\s?', ' ', text)  # Remove o símbolo " : "
    text = re.sub(r'https://\S+', '', text)  # Remove trechos que começam com "https://" seguidos por qualquer sequência de caracteres não espaços em branco
    text = re.sub(r'http://\S+', '', text)  # Remove trechos que começam com "http://" seguidos por qualquer sequência de caracteres não espaços em branco
    text = re.sub(r'[^\w\s]|_+', '', text)  # Remove emojis
    text = re.sub(r'"', '', text)  # Remove aspas
    text = re.sub(r'[^\w\s]', '', text)  # Remove pontuação
    text = re.sub(r'mention', '', text) # Remover a palavra "mention"
    text = re.sub(r'馃', 'c', text) # Decodificar o caractere 'ç'
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Manter apenas palavras no nosso alfabeto

    return text

def clean_data_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ):
    df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(clean_data)
  if ( output_dir ):
    df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [20]:
palestine_df_comments = clean_data_and_save(palestine_df_comments)

## Remoção de StopWords


In [21]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        filtered_text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
        return filtered_text
    else:
        return text

def remove_stopword_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(remove_stopwords)

  if ( output_dir ):df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [22]:
palestine_df_comments = remove_stopword_and_save(palestine_df_comments)

## Transformação de tweets em minúsculos e lematização


In [23]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Inicializar lematizador
lemmatizer = WordNetLemmatizer()

# Função para lematização
def lemmatize_text(text):
    words = text.split()  # Separar o texto em palavras
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def preprocess_lower(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text.lower(), language='english')
        text = ' '.join(tokens)
        text = lemmatize_text(text)
        return text
    else:
        return text

def preprocess_lower_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(preprocess_lower)

  if ( output_dir): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [24]:
palestine_df_comments = preprocess_lower_and_save(palestine_df_comments)

## Remoção de linhas em branco##


In [25]:
def remove_empty_line_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): pd.read_csv(input_dir, encoding='utf-8')

  df.dropna(subset=['comment'], inplace=True)
  df = df[df['comment'].str.strip() != '']

  if ( output_dir ): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [26]:
palestine_df_comments = remove_empty_line_and_save(palestine_df_comments, output_dir='{country}_finalData.csv'.format(country='palestine'))

## Resultado

In [28]:
import pandas as pd

def compare_dataframes(df1, df2, columns):
    """
    Compara dois DataFrames com base na quantidade de valores nulos, não nulos e valores únicos nas colunas especificadas.

    Parâmetros:
    - df1, df2: DataFrames a serem comparados.
    - columns: Lista de colunas a serem analisadas.

    Retorna:
    - Um dicionário com DataFrames para o resumo comparativo das métricas no formato especificado.
    """
    # Listas para armazenar os resultados de nulos, não nulos e valores únicos
    non_null_data = []
    null_data = []
    unique_data = []

    # Itera sobre as colunas para calcular métricas
    for col in columns:
        # Cálculo dos valores não nulos
        non_null_row = {
            "Coluna": col,
            "Original": df1[col].notna().sum(),
            "Após Tratamento": df2[col].notna().sum(),
            "Diferença": df1[col].notna().sum() - df2[col].notna().sum()
        }
        non_null_data.append(non_null_row)

        # Cálculo dos valores nulos
        null_row = {
            "Coluna": col,
            "Original": df1[col].isna().sum(),
            "Após Tratamento": df2[col].isna().sum(),
            "Diferença": df1[col].isna().sum() - df2[col].isna().sum()
        }
        null_data.append(null_row)

        # Cálculo dos valores únicos
        unique_row = {
            "Coluna": col,
            "Original": df1[col].nunique(),
            "Após Tratamento": df2[col].nunique(),
            "Diferença": df1[col].nunique() - df2[col].nunique()
        }
        unique_data.append(unique_row)

    # Converte as listas em DataFrames para os nulos, não nulos e valores únicos
    non_null_df = pd.DataFrame(non_null_data)
    null_df = pd.DataFrame(null_data)
    unique_df = pd.DataFrame(unique_data)

    # Retorna um dicionário com os DataFrames no formato desejado
    return {
        "Valores Não Nulos": non_null_df,
        "Valores Nulos": null_df,
        "Valores Únicos": unique_df
    }

# Exemplo de uso
# df1 e df2 são os seus DataFrames de entrada
columns = ['comment_id', 'author_channel_id', 'author', 'author_channel_url', 'video_id', 'channel_id']
resultado_comparacao = compare_dataframes(original_comments, palestine_df_comments, columns)

# Exibindo os resultados
print("=== Valores Não Nulos ===")
print(resultado_comparacao["Valores Não Nulos"])

print("\n=== Valores Nulos ===")
print(resultado_comparacao["Valores Nulos"])

print("\n=== Valores Únicos ===")
print(resultado_comparacao["Valores Únicos"])

=== Valores Não Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id    814975           717667      97308
1   author_channel_id    814975           717667      97308
2              author    814303           717041      97262
3  author_channel_url    814975           717667      97308
4            video_id    814975           717667      97308
5          channel_id    814975           717667      97308

=== Valores Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id         0                0          0
1   author_channel_id         0                0          0
2              author       672              626         46
3  author_channel_url         0                0          0
4            video_id         0                0          0
5          channel_id         0                0          0

=== Valores Únicos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id    81

## Remoção de Valores Nulos

É melhor excluir os comentários que não é possível identificar o comment_id, por poderem se tratar de valores repetidos. A mesma coisa para os usuários que não possuem user_channel_id preenchido, não sendo possível identifica-los posteriormente. A coluna author_channel_id foi identificada como a chave única de usuário porque, diferente dos campos author e author_channel_url, não podem ser modificadas dentro do sistema, podendo nos levar a crer que se tratam de usuários distintos quando não o são. Dentre as demais features, não temos valores nulos.

In [29]:
duplicates = palestine_df_comments.groupby('author_channel_id').agg({
    'author': lambda x: list(x),
    'author_channel_url': lambda x: list(x)
}).reset_index()

# Filtrando apenas aqueles que têm mais de um autor ou URL
duplicates = duplicates[(duplicates['author'].apply(lambda x: len(set(x))) > 1) |
                        (duplicates['author_channel_url'].apply(lambda x: len(set(x))) > 1)]

print("\n=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===")
for index, row in duplicates.iterrows():
    print(f"\nAuthor Channel ID: {row['author_channel_id']}")
    print(f"Autores: {', '.join(set(row['author']))}")
    print(f"URLs de Channel: {', '.join(set(row['author_channel_url']))}")


=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===

Author Channel ID: UC-5h4iunFKV_q038Y_O0rbQ
Autores: @troll3497, @Internet_God
URLs de Channel: http://www.youtube.com/@troll3497, http://www.youtube.com/@Internet_God

Author Channel ID: UC-FGO7XzxrqM1Ug0GokNM2A
Autores: @AMIkoroma, @Agrig-tu
URLs de Channel: http://www.youtube.com/@AMIkoroma, http://www.youtube.com/@Agrig-tu

Author Channel ID: UC-bx-EqZV0J7iUOtj4bDMNQ
Autores: @Illegal-y7m, @SoupNazi-x2o
URLs de Channel: http://www.youtube.com/@Illegal-y7m, http://www.youtube.com/@SoupNazi-x2o

Author Channel ID: UC0HxMvP-W23HSgiAzaTTEeQ
Autores: @severalwolves, @AwesometownUSA
URLs de Channel: http://www.youtube.com/@AwesometownUSA, http://www.youtube.com/@severalwolves

Author Channel ID: UC0ulPlEXSMycImSlDXyYtPw
Autores: @TomPeranic, @TomPistol
URLs de Channel: http://www.youtube.com/@TomPeranic, http://www.youtube.com/@TomPistol

Author Channel ID: UC1PDRiL7rxPyqnozs5vBMoQ
Autores: @Ujjai

In [30]:
# Remover registros com author_channel_id ou comment_id nulos
palestine_df_coments = palestine_df_comments.dropna(subset=['author_channel_id', 'comment_id'])

In [31]:
## Estatísticas Atualizadas

# df1 e df2 são os seus DataFrames de entrada
columns = ['comment_id', 'author_channel_id', 'author', 'author_channel_url', 'video_id', 'channel_id']
resultado_comparacao = compare_dataframes(original_comments, palestine_df_comments, columns)

# Exibindo os resultados
print("=== Valores Não Nulos ===")
print(resultado_comparacao["Valores Não Nulos"])

print("\n=== Valores Nulos ===")
print(resultado_comparacao["Valores Nulos"])

print("\n=== Valores Únicos ===")
print(resultado_comparacao["Valores Únicos"])

=== Valores Não Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id    814975           717667      97308
1   author_channel_id    814975           717667      97308
2              author    814303           717041      97262
3  author_channel_url    814975           717667      97308
4            video_id    814975           717667      97308
5          channel_id    814975           717667      97308

=== Valores Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id         0                0          0
1   author_channel_id         0                0          0
2              author       672              626         46
3  author_channel_url         0                0          0
4            video_id         0                0          0
5          channel_id         0                0          0

=== Valores Únicos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id    81

In [33]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id,language,urls,mentions
789472,XQGoaVvpm-c,UgyDIMGydtCmgB41-SN4AaABAg,@LinguaFonetica,https://yt3.ggpht.com/rInZmo_JEgl4nulr_4CGI3AX...,http://www.youtube.com/@LinguaFonetica,UCAwi3tah_hWiJeJkLb2DJbw,said un resolution justifies jewish state arab...,2024-10-31T23:13:50Z,2024-10-31T23:13:50Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[]
789473,XQGoaVvpm-c,UgxUGpdHzAceyYbIWDN4AaABAg,@dsnitzer7123,https://yt3.ggpht.com/ytc/AIdro_nbnAzz17y0AjU8...,http://www.youtube.com/@dsnitzer7123,UCshMbwtdRAS8CnQVBoSk93w,cease fire deallebanoniran syriarussiaedomanyo...,2024-10-31T22:28:58Z,2024-10-31T22:28:58Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[]
789474,XQGoaVvpm-c,UgyPfhLChrSMH0hSopR4AaABAg,@bravoRFId63,https://yt3.ggpht.com/ej1GuFl15ApadgYd_7yxo5tJ...,http://www.youtube.com/@bravoRFId63,UChdq_Q1KPuIoN5GjvRA-Ssw,say math,2024-10-31T22:15:26Z,2024-10-31T22:15:26Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[]
789475,XQGoaVvpm-c,UgzcyIzP8x-MpOea_7V4AaABAg,@mirhaider8190,https://yt3.ggpht.com/ytc/AIdro_nbh-GLc1yW_xPy...,http://www.youtube.com/@mirhaider8190,UCxd5mGTlssjH4Vlt33wq2gw,nice try b palestine stolen land,2024-10-31T22:13:54Z,2024-10-31T22:13:54Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[]
759573,rGpYoVsCTdg,UgxUNu91s5_4wWdEJ4h4AaABAg.A8xFzWWQ0MSAAGvsgmW2l1,@SonaharKhatun-o4y,https://yt3.ggpht.com/ytc/AIdro_mGCcO77qRAILee...,http://www.youtube.com/@SonaharKhatun-o4y,UCZ9loIfErRtAaXFa0ofxgNQ,mmdi whyjust telling truthdo humanitythe whole...,2024-10-31T21:23:24Z,2024-10-31T21:23:24Z,0,none,True,True,UgxUNu91s5_4wWdEJ4h4AaABAg,UCmTM_hPCeckqN3cPWtYZZcg,__label__en,[],[@Akna]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689599,rvlQIo60tV4,UgzaDFIViC98wmEqEdR4AaABAg,@notruthinthisworld.8414,https://yt3.ggpht.com/ytc/AIdro_nGeYh4-7xWgW6D...,http://www.youtube.com/@notruthinthisworld.8414,UCEzey9OLufC3Li0Q0ZmK1yQ,oops bad news idf,2024-09-15T05:23:58Z,2024-09-15T05:23:58Z,67,none,True,False,,UCm7lHFkt2yB_WzL67aruVBQ,__label__en,[],[]
689600,rvlQIo60tV4,Ugw2Jtx0n6qufNR4zQJ4AaABAg,@rx1-x,https://yt3.ggpht.com/DVNqhaFcJZ-MwLeOlStzCEYn...,http://www.youtube.com/@rx1-x,UCbJOpUIYAWZmt9kD0wbuxRw,repeat lie often enough eventually becomes israel,2024-09-15T05:22:11Z,2024-09-15T05:22:11Z,656,none,True,False,,UCm7lHFkt2yB_WzL67aruVBQ,__label__en,[],[]
689601,rvlQIo60tV4,UgyHaEtt528FzLJe_6J4AaABAg,@semprince,https://yt3.ggpht.com/ytc/AIdro_nlK0XZCHPihWVQ...,http://www.youtube.com/@semprince,UCAxYnY_CrgVYP5-jkRe4GqQ,fool making fool,2024-09-15T05:21:24Z,2024-09-15T05:21:24Z,177,none,True,False,,UCm7lHFkt2yB_WzL67aruVBQ,__label__en,[],[]
689602,rvlQIo60tV4,UgzrRIcuLYaO8BfdeM54AaABAg,@franciscahammerschlag4103,https://yt3.ggpht.com/ytc/AIdro_lMN55rIWAdDHgN...,http://www.youtube.com/@franciscahammerschlag4103,UCJ7CKcM1i6myQfVL8pY7Wkw,lie told idf tunnel gaza tunnel yet discovered...,2024-09-15T05:16:48Z,2024-09-15T05:16:48Z,4,none,True,False,,UCm7lHFkt2yB_WzL67aruVBQ,__label__en,[],[]


# Exportação de Dados

In [32]:
palestine_df_comments.to_csv('{bp}/{country}/{country}_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_videos.to_csv('{bp}/{country}/{country}_videos_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_channels.to_csv('{bp}/{country}/{country}_channels_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')