<a href="https://colab.research.google.com/github/VictorHugoMartins/israel_x_palestine_data_analysis/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pré-Processamento de Dados

Este arquivo prepara os dados para as fases posteriores.


# Importação de Dados

In [1]:
from google.colab import drive
drive.mount('/content/drive')

base_path = '/content/drive/My Drive/Mestrado'

country = 'palestine'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import os

def read_csv_batches_from_dir(directory):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
    df_list = []
    num_files = len(all_files)  # Contador de arquivos processados

    # Lê cada arquivo e adiciona ao df_list
    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)

    # Concatenar todos os DataFrames em um só
    final_df = pd.concat(df_list, ignore_index=True)

    # Imprime a quantidade de arquivos lidos para o diretório
    print(f"Lidos {num_files} arquivos CSV da pasta {directory}")

    return final_df

def import_data(country):
    # Diretórios contendo os arquivos CSV de comentários, vídeos e canais
    comments_dir = os.path.join(base_path, '{country}/comments_csv_batches'.format(country=country))
    videos_dir = os.path.join(base_path, '{country}/videos_csv_batches'.format(country=country))
    channels_dir = os.path.join(base_path, '{country}/channels_csv_batches'.format(country=country))

    # Ler todos os arquivos CSV dos diretórios
    df_comments = read_csv_batches_from_dir(comments_dir)
    df_videos = read_csv_batches_from_dir(videos_dir)
    df_channels = read_csv_batches_from_dir(channels_dir)

    return df_comments, df_videos, df_channels

# Exemplo de uso:
palestine_df_comments, df_videos, df_channels = import_data('palestine')

# Salva os dados originais
original_comments = palestine_df_comments

Lidos 186 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/comments_csv_batches
Lidos 1 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/videos_csv_batches
Lidos 1 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/channels_csv_batches


## Dados Originais

### Comments

In [3]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id
0,Vf5MThSniiY,UgxAlOKUSIYR4zzO5Dh4AaABAg.AAM6Ebg-s4jAAMAxwhHu7i,@jmjfanss,https://yt3.ggpht.com/x2t43hSZiQv9n5SFIByZDJSd...,http://www.youtube.com/@jmjfanss,UCY1_6lo-Yjw95MFvAkIoiwA,"I hear you, that's why I stopped voting after ...",2024-11-02T22:20:07Z,2024-11-02T22:20:07Z,1,none,True,True,UgxAlOKUSIYR4zzO5Dh4AaABAg,UCH1dpzjCEiGAt8CXkryhkZg
1,Vf5MThSniiY,Ugz1WuGNXBEdkIAk7nF4AaABAg,@jesusjaviergarcia,https://yt3.ggpht.com/ytc/AIdro_nI5jsz-0VV4h0y...,http://www.youtube.com/@jesusjaviergarcia,UCMZ6Jqx4HyXAE9-qgVUpAQA,Let the man Speak,2024-11-02T22:19:23Z,2024-11-02T22:19:23Z,0,none,True,False,,UCH1dpzjCEiGAt8CXkryhkZg
2,2SJomX0tw5E,UgzV-IIP1aPgEH0N8Rl4AaABAg.AAHRRN90CaVAAMAfqFqGyT,@anam.caballerowilson9421,https://yt3.ggpht.com/NYwbrC50fBnFHiB2RTrWHpPl...,http://www.youtube.com/@anam.caballerowilson9421,UCnGYqAfe-KVM0QxEkBeMbXA,What traitor? Have you watched Gladiator? he h...,2024-11-02T22:17:39Z,2024-11-02T22:17:39Z,0,none,True,True,UgzV-IIP1aPgEH0N8Rl4AaABAg,UCckHqySbfy5FcPP6MD_S-Yg
3,Vf5MThSniiY,UgwIo0q9vEPdyBqtCAh4AaABAg,@wardyra,https://yt3.ggpht.com/ytc/AIdro_kyx-poWpglC-tv...,http://www.youtube.com/@wardyra,UC4Yz5SaY8hHfqv71CdgnwYw,It's not good that this is a video you have to...,2024-11-02T22:15:25Z,2024-11-02T22:15:25Z,0,none,True,False,,UCH1dpzjCEiGAt8CXkryhkZg
4,YaNcow2MQA8,UgwkDideIRiMeFYOOWp4AaABAg.AAKlMjhd4PUAAMAF5RngF1,@FireAnt745,https://yt3.ggpht.com/ytc/AIdro_nj52r54LP66EYF...,http://www.youtube.com/@FireAnt745,UCmc7w-9RGbzt757OTbGxAsw,"@@cryptoreport8762 Zionism is not Judaism, it'...",2024-11-02T22:13:51Z,2024-11-02T22:13:51Z,0,none,True,True,UgwkDideIRiMeFYOOWp4AaABAg,UC7fWeaHhqgM4Ry-RMpM2YYw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851762,EswmEJOyaFY,UgzstlvfmPXmTWr7P7F4AaABAg,@estanislaubabosoares1677,https://yt3.ggpht.com/ytc/AIdro_k9jCJGi8-Zs-Lb...,http://www.youtube.com/@estanislaubabosoares1677,UCaBYqFZyIi7M4UtvwWw6s1Q,Lanjut om BEN. Stop gencetan senjata. 💪,2024-07-15T02:08:37Z,2024-07-15T02:08:37Z,0,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851763,EswmEJOyaFY,UgwslOqRFx39fwZjRGh4AaABAg,@ahandaja5425,https://yt3.ggpht.com/ytc/AIdro_lO1bg78qyz0HR_...,http://www.youtube.com/@ahandaja5425,UC6I6pE2nQiBhEfjJ5HwFTlg,Berarti idf wajar saja menembaki warga palesti...,2024-07-15T01:38:26Z,2024-07-15T01:38:26Z,0,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851764,EswmEJOyaFY,UgyCpVrAR2Yk5jJFwoV4AaABAg,@Jeckahmed,https://yt3.ggpht.com/k10-YF12hZqsgUjJjNCsyGKF...,http://www.youtube.com/@Jeckahmed,UCx3JwNQeg_bFgYAWsUHj-iw,Alhamdulillah,2024-07-15T01:37:22Z,2024-07-15T01:37:22Z,33,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851765,EswmEJOyaFY,UgwLORvcKeOo2W9VbJd4AaABAg,@muhammadsafii8903,https://yt3.ggpht.com/ytc/AIdro_lPAu5LcUW1h_9K...,http://www.youtube.com/@muhammadsafii8903,UCfxI5fefq674l_Okk3VBTag,Alhamdulillah free palestine🇵🇸❤️,2024-07-15T01:31:47Z,2024-07-15T01:31:47Z,649,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg


In [4]:
palestine_df_comments.describe()

Unnamed: 0,like_count
count,1851767.0
mean,6.365636
std,119.9675
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,55571.0


### Vídeos

In [5]:
df_videos

Unnamed: 0,video_id,title,description,channel_id,published_at,category_id,tags,view_count,like_count,comment_count,...,scheduled_end_time,concurrent_viewers,active_live_chat_id,recording_date,topicCategories,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
0,5oiduodVVvs,Ehud Barak: Hamas’ October 7 attack exposed Is...,Former Israeli Prime Minister Ehud Barak said ...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T14:14:48Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",2964,944,88,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
1,nXu-rS-W-6E,Muslim Crowd GOES SILENT as Bill Clinton revea...,Wool removed from Pres. Bill Clinton's eyes: J...,UC1EasxeXGzoXJb2y3HTMsLA,2024-11-03T13:52:26Z,25,"['Israel', 'War', 'Terror', 'Hamas', 'Jihadi',...",9138,3826,948,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
2,scDRf0Wskh0,Israel Hezbollah War LIVE | Israel Captures He...,Israel Hezbollah War LIVE | Israel Captures He...,UCef1-8eOpJgud7szVPlZQAQ,2024-11-03T12:35:09Z,25,"['news18', 'cnn news18', 'latest news', 'israe...",16534,197,8,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
3,0zM2pHTqsn0,‘Timing Left To…’ Iran Okays Israel Attack As ...,The United States Central Command announced th...,UC3prwMn9aU2z5Y158ZdGyyA,2024-11-03T12:19:42Z,25,"['iran israel war', 'israel iran war', 'iran i...",9233,759,383,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,
4,uEF08Tq6A5o,What is Jabaliya refugee camp and why is Israe...,"With every war and Israeli assault on Gaza, Ja...",UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T11:46:52Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",4029,1202,215,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,m44oIWP2A1c,#army #military #motivation #keşfet #israel #i...,#army #military #motivation #keşfet #israel #i...,UCvWb3kEo5D4KRgQa-JjceqA,2024-07-15T00:41:57Z,24,"['Army', 'Armyforce', 'Armylover', 'Art', 'Ani...",14312,0,9,...,,0,,,['https://en.wikipedia.org/wiki/Military'],,0,0,0,
2487,jrngJL4Iwo8,Israel Palestine Dress Up (Left or Right?) 🇮🇱🇵🇸,Ok so this is basically another one of THOSE C...,UCy6o8vCErPavp37zwSIGAJA,2024-07-15T00:33:35Z,27,[],13012,279,24,...,,0,,,[],,0,0,0,
2488,lFSr_JDUDQk,How to draw Palestine support Algeria and Bang...,How to draw Palestine support Algeria and Bang...,UChFExE69cWu7x-7FjuJDGiw,2024-07-15T00:30:20Z,28,"['How to draw Palestine flag drawing', 'Islami...",13585,631,9,...,,0,,,"['https://en.wikipedia.org/wiki/Hobby', 'https...",,0,0,0,
2489,rw-8j3U1W28,Roger Waters Criticizes UK Leaders' Support fo...,In a passionate conversation with Piers Morgan...,UCnA2ZZ_6P7DZbmUVEolAp3g,2024-07-15T00:01:26Z,22,[],11324,306,40,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,


In [6]:
df_videos.describe()

Unnamed: 0,category_id,view_count,like_count,comment_count,scheduled_end_time,concurrent_viewers,active_live_chat_id,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
count,2491.0,2491.0,2491.0,2491.0,0.0,2491.0,0.0,0.0,2491.0,2491.0,2491.0,0.0
mean,24.378563,168478.9,5012.417102,766.24448,,0.0,,,0.0,0.0,0.0,
std,3.012292,546599.8,22012.532446,2252.815496,,0.0,,,0.0,0.0,0.0,
min,1.0,1.0,0.0,1.0,,0.0,,,0.0,0.0,0.0,
25%,25.0,3834.0,96.0,9.0,,0.0,,,0.0,0.0,0.0,
50%,25.0,24420.0,649.0,99.0,,0.0,,,0.0,0.0,0.0,
75%,25.0,125813.5,3101.5,628.5,,0.0,,,0.0,0.0,0.0,
max,29.0,17358460.0,770902.0,42331.0,,0.0,,,0.0,0.0,0.0,


### Canais

In [7]:
df_channels

Unnamed: 0,channel_id,title,description,published_at,country,view_count,comment_count,subscriber_count,video_count,is_verified,keywords,profile_picture_url
0,UCkmLoDQ8AVKwEsDSJtNKmug,WOLD NEWS,Subscribe for USA,2024-08-31T21:48:41.363607Z,US,806251,0,912,373,False,"trump noticia ""kamalla harris"" vote",https://yt3.ggpht.com/VZXxIUrbv9NoSwomQz_4sOlC...
1,UCY3XuRwUrx0Tuyfxc9h-CKA,MuslimOdysseyalt,,2024-08-24T11:32:15.475473Z,,462,0,8,5,False,,https://yt3.ggpht.com/rnLjbQXTrKIW7cfHGt_rwqDV...
2,UCXBD5iG5cr4ZYZ99K-fmDHg,NDTV World,"NDTV World delivers a fresh, balanced, and inc...",2024-08-21T12:22:26.662914Z,IN,527475,0,2620,855,False,"""latest world news"" ""global news"" ""daily world...",https://yt3.ggpht.com/XPPea2c5PLsz1QmWmrYf5aSE...
3,UC-qF_LzQLDcjT2hiaz1Fs8g,Monkeypox,Channel ini berencana untuk mengupload Ismael ...,2024-08-03T02:25:35.585393Z,ID,2422978,0,4430,878,False,,https://yt3.ggpht.com/ytc/AIdro_lkd0rxDNf5fMK_...
4,UCZo9arkS0Nn-mVVaVHdwBwA,Daily Motations,Welcome to DailyMotation ! 🚀 Dive into a world...,2024-07-29T14:12:54.844809Z,DE,131345,0,510,119,False,,https://yt3.ggpht.com/AdDTFqDWIkdYwY7flduCS3p7...
...,...,...,...,...,...,...,...,...,...,...,...,...
831,UC1yBKRuGpC1tSM73A0ZjYjQ,The Young Turks,The Young Turks is the longest-running news pr...,2005-12-21T20:46:51Z,US,6913421734,0,6080000,65101,False,"news politics tyt ""young turks"" ""the young tur...",https://yt3.ggpht.com/f5y_0KmKhcvJI4fwO93TZYiv...
832,UCm7lHFkt2yB_WzL67aruVBQ,Hindustan Times,"Hindustan Times Videos bring you news, views a...",2005-11-16T02:41:16Z,IN,5669517181,0,7410000,70374,False,"""world news"" ""us news"" ""HT world"" ""Hindustan T...",https://yt3.ggpht.com/rxxycwwjFXuC-eQNBcklj4P-...
833,UC9LQwHZoucFT94I2h6JOcjw,Liverpool FC,Get closer to the Reds than anyone else!\n\nWe...,2005-10-23T01:19:05Z,GB,3031865702,0,10500000,7235,False,"Liverpool LFC ""Liverpool FC"" ""Liverpool Footba...",https://yt3.ggpht.com/XMb1CWW_li3OqjDsr6UyHdst...
834,UCupvZG-5ko_eiXAupbDfxWw,CNN,CNN is the world leader in news and informatio...,2005-10-02T16:06:36Z,,16867234203,0,17200000,167740,False,"CNN ""CNN News"" news ""breaking news""",https://yt3.ggpht.com/n5DRh94eycw6xGcOKTn6LKQw...


In [8]:
df_channels.describe()

Unnamed: 0,view_count,comment_count,subscriber_count,video_count
count,836.0,836.0,836.0,836.0
mean,643513600.0,0.0,1278458.0,16232.08134
std,2071057000.0,0.0,3543057.0,48133.011191
min,28.0,0.0,0.0,1.0
25%,321835.8,0.0,2512.5,192.75
50%,9390026.0,0.0,47300.0,698.0
75%,213898500.0,0.0,714500.0,4331.5
max,20199960000.0,0.0,38100000.0,611361.0


# Limpeza de Comentários

Redução do vocabulário dos comentários coletados. Vídeos e Canais não passam por esse processo por não serem o foco da pesquisa e servirem como bases auxiliares para o percurso principal.

## Remoção de Dados Duplicados

In [9]:
# Função para remover duplicatas e manter os registros mais recentes.
# Intuito: garantir que não foram coletados dados repetidos para diferentes países
def remove_duplicates(df, id_column, date_column):
    # Ordena por data de forma decrescente e remove duplicatas mantendo o mais recente
    df_sorted = df.sort_values(by=date_column, ascending=False)
    df_unique = df_sorted.drop_duplicates(subset=id_column, keep='first')
    return df_unique

In [10]:
# Gerar as estatísticas
palestine_df_comments = remove_duplicates(palestine_df_comments, 'comment_id', 'updated_at')

## Filtragem por idioma

Apenas comentários em inglês serão utilizados na pesquisa. Não haverá, por agora, uma abordagem multilanguage.

In [11]:
# 1. Instalar FastText
!pip install fasttext

# 2. Importar bibliotecas necessárias
import fasttext
import pandas as pd

# 3. Baixar o modelo de identificação de idiomas do FastText
# Use the raw file URL to download the binary model directly
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O lid.176.bin

--2024-11-24 23:53:46--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.244.202.62, 18.244.202.25, 18.244.202.103, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.244.202.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2024-11-24 23:53:47 (159 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



In [12]:
# Carregae o modelo de linguagem
def load_model():
    model_path = 'lid.176.bin'
    model = fasttext.load_model(model_path)
    return model

model = load_model()

# Função para identificar o idioma de um comentário
def detect_language(comment):
    # Remover quebras de linha
    if isinstance(comment, str):
      comment = comment.replace('\n', ' ').strip()  # Remove quebras de linha e espaços extras
      prediction = model.predict(comment)
      return prediction[0][0]  # Retornar o código do idioma
    else:
      return ''

def filter_by_english(df_comments, verbose=False):
  # Aplicar a função de detecção de idioma a cada comentário
  df_comments['language'] = df_comments['comment'].apply(detect_language)

  # Filtrar os comentários em inglês
  english_comments = df_comments[df_comments['language'] == '__label__en']

  if verbose:
    print(english_comments[['comment_id', 'comment']])

  return english_comments

In [13]:
palestine_df_comments = filter_by_english(palestine_df_comments, verbose=True)

                         comment_id  \
1837743  UgwciMkyTHL7zX5Bc9J4AaABAg   
1841768  UgwAEqmeCnfI7yLtiYF4AaABAg   
1841769  UgxP2KZ-jG2pMAuSa2F4AaABAg   
1841773  UgxKcWS-_u7lThB7gRp4AaABAg   
1841776  UgxJfWinU27jAG799NZ4AaABAg   
...                             ...   
1787923  UgwwKUl0XYJVEz0E-Yd4AaABAg   
1787924  UgwxbAfZN0JK6wahefl4AaABAg   
1777923  UgxuVj1oa7hFOO1E6XZ4AaABAg   
1777924  UgyCme9ns55AV0DxejF4AaABAg   
1807848  UgxW5KJPZvpwdVG282Z4AaABAg   

                                                   comment  
1837743    This is I'll ....the sample is pure master mind  
1841768  J.D.Vance is a good man. Hopefully , he will b...  
1841769  Give the stolen land back to the Palestinian p...  
1841773  While we can all love Trump, let's not forget ...  
1841776                                GOD. BLESS   MEXICO  
...                                                    ...  
1787923                  Thanks Sam so refreshing I’m home  
1787924                                    

### Filtragem de Vídeos e Canais

Aqui, calculamos o nosso universo a quantidade de canais, vídeos e comentários em inglês. Suas novas quantidades são calculadas, assim como o número de usuários comentaristas.

In [14]:
# # Filtrar vídeos que possuem channel_id em df_channels e video_id em df_comments
# df_videos = df_videos[
#     (df_videos['video_id'].isin(palestine_df_comments['video_id']))
# ]

# df_videos

In [15]:
# df_channels = df_channels[
#     (df_channels['channel_id'].isin(palestine_df_comments['channel_id'])) |
#     (df_channels['channel_id'].isin(palestine_df_comments['author_channel_id']))
# ]

# df_channels

## Filtragem por Keywords

In [16]:
import pandas as pd

# Lista de palavras-chave
key_words = [
    'Palestine', 'Israel', 'Zionism', 'Jew', 'Muslim',
    'West Bank', 'Gaza', 'middle east', 'Hamas',
    'Jerusalem', 'Two-state solution', 'IDF',
    'lebanon', 'yemen', 'syria', 'Palestinian', 'israli', 'Intifada'
]

# Normaliza a lista de palavras-chave para evitar problemas com maiúsculas/minúsculas
key_words_lower = [kw.lower() for kw in key_words]

# Tamanho inicial dos DataFrames
print("Número de vídeos antes do filtro:", len(df_videos))
print("Número de comentários antes do filtro:", len(palestine_df_comments))

# Filtra o DataFrame de vídeos
filtered_videos = df_videos[
    df_videos['title'].str.lower().str.contains('|'.join(key_words_lower))
]

# Mantém apenas os comentários associados aos vídeos filtrados
filtered_comments = palestine_df_comments[
    palestine_df_comments['video_id'].isin(filtered_videos['video_id'])
]

# Tamanho depois do filtro
print("Número de vídeos após o filtro:", len(filtered_videos))
print("Número de comentários após o filtro:", len(filtered_comments))

# Atualiza o DataFrame de comentários
palestine_df_comments = filtered_comments

# Títulos que não contêm as palavras-chave
excluded_videos = df_videos[~df_videos['title'].str.lower().str.contains('|'.join(key_words_lower))]

# Quantidade de comentários para cada título excluído
excluded_comments = palestine_df_comments[
    palestine_df_comments['video_id'].isin(excluded_videos['video_id'])
]

# Garante que os comentários excluídos sejam contados corretamente
if not excluded_comments.empty:
    excluded_comments_count = (
        excluded_comments
        .groupby('video_id')
        .size()
        .reset_index(name='comment_count')
    )
else:
    excluded_comments_count = pd.DataFrame(columns=['video_id', 'comment_count'])

# Adiciona os títulos ao DataFrame de contagem
excluded_videos_with_count = excluded_videos.merge(
    excluded_comments_count,
    on='video_id',
    how='left'
).fillna({'comment_count': 0})  # Preenche com 0 caso não haja comentários

# Converte comment_count para inteiro
# excluded_videos_with_count['comment_count'] = excluded_videos_with_count['comment_count'].astype(int)

# Exibe os resultados
print(excluded_videos_with_count[['title', 'tags']])

Número de vídeos antes do filtro: 2491
Número de comentários antes do filtro: 1637671
Número de vídeos após o filtro: 2463
Número de comentários após o filtro: 1623604
                                                title  \
0   Reporter Liam Cosgrove accuses US of abetting ...   
1   "Kamala Harris Could End The Genocide TODAY"- ...   
2   The Genocide Of Hindus in Bangladesh 🇧🇩‼️ #shorts   
3   Bangladesh: The Genocide Of Hindus You Haven't...   
4                       Zionist shreds genocide claim   
5   ‘Stop funding genocide’: activists storm Citib...   
6          Who is funding the Genocide in the Congo?!   
7              Amir Tsarfati: The Genocide Propaganda   
8   How rape sold a genocide: MEMO in Conversation...   
9   LAGOS NIGERIA TO DAKAR SENEGAL BY ROAD - EP1 (...   
10  Hezbollah Escalates Threats as Border Tensions...   
11  Top 7 most dangerous border in the World | Tam...   
12             Debunking the “Illegal Occupation” Lie   
13  Border hospitals in Kharkiv di

## Download de Bases Pré-Treinadas

In [17]:
!pip install nltk

import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('rslp')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('punkt')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Limpeza Textual


In [18]:
!pip install ftfy
!pip install apyori

import csv
import re
import ftfy
from nltk.corpus import stopwords

import pandas as pd



## Tratamento de colunas: Links, Mentions, Tópicos e Tags

In [20]:
import re

def extract_hashtags_from_comments(df, comment_column='comments'):
    """
    Extrai todas as hashtags dos comentários e adiciona uma nova coluna com as hashtags encontradas.

    Parameters:
        df (DataFrame): DataFrame contendo os comentários.
        comment_column (str): Nome da coluna que contém os comentários (padrão é 'comments').

    Returns:
        DataFrame: DataFrame com uma nova coluna 'hashtags', que contém as hashtags extraídas.
    """
    # Função para extrair as hashtags de um único comentário
    def extract_hashtags(comment):
        if isinstance(comment, str):
            return re.findall(r'#\w+', comment)  # Encontra todas as palavras que começam com '#' (hashtags)
        return []  # Retorna uma lista vazia se o comentário não for uma string

    # Aplica a função de extração de hashtags a cada comentário da coluna 'comments'
    df['hashtags'] = df[comment_column].apply(extract_hashtags)

    return df

# Exemplo de uso
palestine_df_comments = extract_hashtags_from_comments(palestine_df_comments, comment_column='comment')

# Verifique o resultado (primeiras 5 linhas)
print(palestine_df_comments[['comment', 'hashtags']].head())

                                                   comment hashtags
1841768  J.D.Vance is a good man. Hopefully , he will b...       []
1841769  Give the stolen land back to the Palestinian p...       []
1841773  While we can all love Trump, let's not forget ...     [#1]
1841776                                GOD. BLESS   MEXICO       []
1817752  Facts Muslims are now added to the stupid clas...       []


In [21]:
import re

# Função para extrair URLs
def extract_urls(comment):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.findall(url_pattern, comment)

# Função para extrair menções
def extract_mentions(comment):
    mention_pattern = r'@[\w]+'
    return re.findall(mention_pattern, comment)

# Aplicando as funções para criar as novas colunas
palestine_df_comments['urls'] = palestine_df_comments['comment'].apply(extract_urls)
palestine_df_comments['urls']

Unnamed: 0,urls
1841768,[]
1841769,[]
1841773,[]
1841776,[]
1817752,[]
...,...
1787923,[]
1787924,[]
1777923,[]
1777924,[]


In [22]:
palestine_df_comments['mentions'] = palestine_df_comments['comment'].apply(extract_mentions)
palestine_df_comments['mentions']

Unnamed: 0,mentions
1841768,[]
1841769,[]
1841773,[]
1841776,[]
1817752,[]
...,...
1787923,[]
1787924,[]
1777923,[]
1777924,[]


In [23]:
# Filtra as URLs não nulas e que não são listas vazias
non_empty_urls = palestine_df_comments[palestine_df_comments['urls'].apply(lambda x: isinstance(x, str) and x != '[]')]

# Imprime as primeiras 20 URLs não nulas e não vazias
print(non_empty_urls['urls'].head(20))

Series([], Name: urls, dtype: object)


In [24]:
import pandas as pd
import os
import re
import googleapiclient.discovery  # Para a API do YouTube
import googleapiclient.errors


# Função para extrair IDs de vídeos ou canais a partir das URLs
def extract_video_or_channel_id(url):
    """
    Extrai o ID de vídeo ou canal a partir da URL do YouTube.
    Para URLs de vídeo: "https://www.youtube.com/watch?v=<video_id>"
    Para URLs de canal: "https://www.youtube.com/channel/<channel_id>"

    Parameters:
        url (str): URL do vídeo ou canal no YouTube.

    Returns:
        str: ID extraído (vídeoId ou channel_id).
    """
    if not isinstance(url, str) or not url:  # Verifica se a URL é uma string não vazia
        return None, None

    video_id_pattern = r"v=([a-zA-Z0-9_-]+)"
    channel_id_pattern = r"youtube\.com\/channel\/([a-zA-Z0-9_-]+)"

    video_match = re.search(video_id_pattern, url)
    if video_match:
        return video_match.group(1), "video"

    channel_match = re.search(channel_id_pattern, url)
    if channel_match:
        return channel_match.group(1), "channel"

    return None, None

# Função para coletar informações da API do YouTube
def get_youtube_data(ids, data_type="video"):
    """
    Coleta dados da API do YouTube para os IDs fornecidos (vídeo ou canal).

    Parameters:
        ids (list): Lista de IDs de vídeos ou canais.
        data_type (str): Tipo de dado ("video" ou "channel").

    Returns:
        list: Lista de dicionários com as informações coletadas.
    """
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey="YOUR_API_KEY")

    results = []
    if data_type == "video":
        for video_id in ids:
            request = youtube.videos().list(part="snippet,statistics", id=video_id)
            response = request.execute()
            for item in response.get('items', []):
                results.append({
                    'video_id': item['id'],
                    'title': item['snippet']['title'],
                    'description': item['snippet']['description'],
                    'viewCount': item['statistics'].get('viewCount', 'N/A')
                })
    elif data_type == "channel":
        for channel_id in ids:
            request = youtube.channels().list(part="snippet,statistics", id=channel_id)
            response = request.execute()
            for item in response.get('items', []):
                results.append({
                    'channel_id': item['id'],
                    'channelTitle': item['snippet']['title'],
                    'subscriberCount': item['statistics'].get('subscriberCount', 'N/A')
                })

    return results

In [25]:
import os
import re
import pandas as pd

def save_new_batch(data_df, directory_path, data_type="videos", existing_ids=None):
    """
    Adiciona as novas informações de vídeos ou canais em um novo arquivo de lote, incrementando o número do lote.
    A função verifica os dados já salvos (através dos IDs fornecidos) e só inclui os dados novos.

    Parameters:
        data_df (DataFrame): DataFrame com as novas informações dos vídeos ou canais.
        directory_path (str): Caminho para o diretório onde estão os arquivos de lote.
        data_type (str): Tipo de dado ("videos" ou "channels").
        existing_ids (list): Lista de IDs que já foram processados (para comparar com os dados novos).

    Returns:
        None
    """
    # Verifica se o data_df contém dados
    if data_df.empty:
        print(f"DataFrame vazio, nada para salvar em {data_type}.")
        return

    # Lista todos os arquivos no diretório
    existing_files = [f for f in os.listdir(directory_path) if re.match(r"lote_\d+_{}.csv".format(data_type), f)]

    # Extrai os números dos arquivos de lote existentes
    lote_numbers = [int(re.search(r"lote_(\d+)_{}.csv".format(data_type), f).group(1)) for f in existing_files]
    max_lote_number = max(lote_numbers) if lote_numbers else 0  # Define 0 caso não existam arquivos

    # Se existirem arquivos, carrega o último arquivo para verificar quais dados já foram salvos
    if max_lote_number > 0:
        last_lote_file = os.path.join(directory_path, f"lote_{max_lote_number}_{data_type}.csv")
        existing_data_df = pd.read_csv(last_lote_file)

        # Identifica o campo que é o identificador único (videoId para vídeos, channelId para canais)
        unique_id_column = 'videoId' if data_type == "videos" else 'channelId'

        # Obtém os IDs existentes, considerando apenas os dados anteriores
        existing_ids = set(existing_ids) | set(existing_data_df[unique_id_column].unique()) if existing_ids else set(existing_data_df[unique_id_column].unique())

    else:
        # Se não existirem arquivos anteriores, todos os dados são novos
        existing_ids = set(existing_ids) if existing_ids else set()

    # Filtra os dados novos que ainda não foram salvos
    unique_id_column = 'videoId' if data_type == "videos" else 'channelId'
    new_data_df = data_df[~data_df[unique_id_column].isin(existing_ids)]

    # Se houver dados novos, salva em um novo arquivo
    if not new_data_df.empty:
        new_lote_number = max_lote_number + 1
        new_file_name = f"lote_{new_lote_number}_{data_type}.csv"
        new_file_path = os.path.join(directory_path, new_file_name)

        # Salva o DataFrame com os novos dados no arquivo
        new_data_df.to_csv(new_file_path, index=False, encoding='utf-8')

        print(f"Novo arquivo de {data_type} criado com dados novos: {new_file_path}")
    else:
        print(f"Não há dados novos para salvar de {data_type}.")

# Passo 1: Extração dos IDs de vídeos e canais
palestine_df_comments['id_info'] = palestine_df_comments['urls'].apply(lambda url: extract_video_or_channel_id(url))

# Verifica os dados extraídos
print(palestine_df_comments['id_info'].head())

# Separando os IDs dos vídeos e canais
video_ids = []
channel_ids = []
for idx, row in palestine_df_comments.iterrows():
    if row['id_info'][1] == 'video':
        video_ids.append(row['id_info'][0])
    elif row['id_info'][1] == 'channel':
        channel_ids.append(row['id_info'][0])

# Passo 2: Coleta dos dados da API do YouTube
# Aqui é onde você chama a função da API do YouTube para coletar os dados dos vídeos e canais
video_data = get_youtube_data(video_ids, data_type="video")
channel_data = get_youtube_data(channel_ids, data_type="channel")

# Verifica os dados coletados
print(f"Dados dos vídeos coletados: {video_data[:5]}")
print(f"Dados dos canais coletados: {channel_data[:5]}")

# Convertendo as informações coletadas para DataFrames
df_video_data = pd.DataFrame(video_data)
df_channel_data = pd.DataFrame(channel_data)

# Verifica se os DataFrames não estão vazios
print(f"DataFrame de vídeos: {df_video_data.head()}")
print(f"DataFrame de canais: {df_channel_data.head()}")

# Passo 3: Salvamento dos dados novos em arquivos de lote
directory_path_videos = os.path.join(base_path, '{country}/videos_csv_batches'.format(country=country))
directory_path_channels = os.path.join(base_path, '{country}/channels_csv_batches'.format(country=country))

# IDs existentes
existing_video_ids = df_videos['video_id'].tolist()  # IDs de vídeo no df_videos
existing_channel_ids = df_channels['channel_id'].tolist()  # Apenas IDs de canal no df_channels

# Salvando dados novos
save_new_batch(df_video_data, directory_path_videos, data_type="videos", existing_ids=existing_video_ids)
save_new_batch(df_channel_data, directory_path_channels, data_type="channels", existing_ids=existing_channel_ids)

1841768    (None, None)
1841769    (None, None)
1841773    (None, None)
1841776    (None, None)
1817752    (None, None)
Name: id_info, dtype: object
Dados dos vídeos coletados: []
Dados dos canais coletados: []
DataFrame de vídeos: Empty DataFrame
Columns: []
Index: []
DataFrame de canais: Empty DataFrame
Columns: []
Index: []
DataFrame vazio, nada para salvar em videos.
DataFrame vazio, nada para salvar em channels.


In [26]:
import pandas as pd
import ast

# Função para transformar os elementos em lowercase e extrair parte relevante de URLs
def process_element(element):
    element = element.lower()  # Converte para minúsculas
    if "https://en.wikipedia.org/wiki/" in element:
        # Extrai o termo após '/wiki/' na URL
        return element.split('/')[-1]
    return element

# Função para pré-processar as colunas 'topicCategories' e 'tags'
def preprocess_columns(df_videos):
    # Processa a coluna 'topicCategories'
    if 'topicCategories' in df_videos.columns:
        df_videos['topicCategories'] = df_videos['topicCategories'].apply(
            lambda x: [process_element(item) for item in ast.literal_eval(x)] if isinstance(x, str) else []
        )

    # Processa a coluna 'tags'
    if 'tags' in df_videos.columns:
        df_videos['tags'] = df_videos['tags'].apply(
            lambda x: [process_element(item) for item in ast.literal_eval(x)] if isinstance(x, str) else []
        )

    return df_videos

# Exemplo de uso:
# Preprocessa as colunas 'topicCategories' e 'tags' do df_videos
df_videos = preprocess_columns(df_videos)

## Limpeza de texto

In [30]:
import re
import ftfy
import pandas as pd

# Função para limpar texto
def clean_data(text):
    text = str(text)
    text = ftfy.fix_text(text)  # Corrige erros de codificação de texto
    # text = text.split("https://")[0]  # Remove links iniciados com https://
    # text = text.split("http://")[0]  # Remove links iniciados com http://
    text = re.sub(r'k{2,}', '', text)  # Remove sequência de "kkk"
    text = re.sub(r'\d+', '', text)  # Remove números
    text = re.sub(r'@\w+\s?', '', text)  # Remove o padrão "@algumusuario"
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)  # Remove a sequência "RT"
    text = re.sub(r'\s?:\s?', ' ', text)  # Remove o símbolo " : "
    text = re.sub(r'[^\w\s#@]', '', text)  # Remove caracteres especiais, mantendo hashtags e menções
    text = re.sub(r'"', '', text)  # Remove aspas
    text = re.sub(r'mention', '', text)  # Remove a palavra "mention"
    return text

# Função para aplicar a limpeza ao DataFrame e salvar
def clean_data_and_save(df, input_dir=None, output_dir=None):
    if input_dir:
        df = pd.read_csv(input_dir, encoding='utf-8')
    df['comment'] = df['comment'].apply(clean_data)
    if output_dir:
        df.to_csv(output_dir, index=False, encoding='utf-8')
    return df

In [31]:
palestine_df_comments = clean_data_and_save(palestine_df_comments)

## Remoção de StopWords


In [32]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        filtered_text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
        return filtered_text
    else:
        return text

def remove_stopword_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(remove_stopwords)

  if ( output_dir ):df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [33]:
palestine_df_comments = remove_stopword_and_save(palestine_df_comments)

## Transformação de tweets em minúsculos e lematização


In [34]:
nltk.download('punkt_tab')

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Inicializar lematizador
lemmatizer = WordNetLemmatizer()

# Função para lematização
def lemmatize_text(text):
    words = text.split()  # Separar o texto em palavras
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def preprocess_lower(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text.lower(), language='english')
        text = ' '.join(tokens)
        text = lemmatize_text(text)
        return text
    else:
        return text

def preprocess_lower_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(preprocess_lower)

  if ( output_dir): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
palestine_df_comments = preprocess_lower_and_save(palestine_df_comments)

## Remoção de linhas em branco##


In [36]:
def remove_empty_line_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): pd.read_csv(input_dir, encoding='utf-8')

  df.dropna(subset=['comment'], inplace=True)
  df = df[df['comment'].str.strip() != '']

  if ( output_dir ): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [37]:
palestine_df_comments = remove_empty_line_and_save(palestine_df_comments, output_dir='{country}_finalData.csv'.format(country='palestine'))

## Filtro por tamanho do comentário

In [38]:
palestine_df_comments = palestine_df_comments[palestine_df_comments['comment'].apply(lambda x: len(x) > 50)]

In [39]:
# Remover registros com author_channel_id ou comment_id nulos
palestine_df_coments = palestine_df_comments.dropna(subset=['author_channel_id', 'comment_id'])

## Remoção de Valores Nulos

É melhor excluir os comentários que não é possível identificar o comment_id, por poderem se tratar de valores repetidos. A mesma coisa para os usuários que não possuem user_channel_id preenchido, não sendo possível identifica-los posteriormente. A coluna author_channel_id foi identificada como a chave única de usuário porque, diferente dos campos author e author_channel_url, não podem ser modificadas dentro do sistema, podendo nos levar a crer que se tratam de usuários distintos quando não o são. Dentre as demais features, não temos valores nulos.

In [40]:
duplicates = palestine_df_comments.groupby('author_channel_id').agg({
    'author': lambda x: list(x),
    'author_channel_url': lambda x: list(x)
}).reset_index()

# Filtrando apenas aqueles que têm mais de um autor ou URL
duplicates = duplicates[(duplicates['author'].apply(lambda x: len(set(x))) > 1) |
                        (duplicates['author_channel_url'].apply(lambda x: len(set(x))) > 1)]

print("\n=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===")
for index, row in duplicates.iterrows():
    print(f"\nAuthor Channel ID: {row['author_channel_id']}")
    print(f"Autores: {', '.join(set(row['author']))}")
    print(f"URLs de Channel: {', '.join(set(row['author_channel_url']))}")


=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===

Author Channel ID: UC-RqsssaH_XTUIVco-YLa9g
Autores: @burakcoskun38, @burakcoskun1865
URLs de Channel: http://www.youtube.com/@burakcoskun1865, http://www.youtube.com/@burakcoskun38

Author Channel ID: UC07Ax-lQHEMIM7THzfjHKjg
Autores: @thedemigoddess333, @coldcoldheart2424
URLs de Channel: http://www.youtube.com/@thedemigoddess333, http://www.youtube.com/@coldcoldheart2424

Author Channel ID: UC2Sher9wGs0_-S7i4hHRHfA
Autores: @James_Barris, @Phuh_Queue
URLs de Channel: http://www.youtube.com/@Phuh_Queue, http://www.youtube.com/@James_Barris

Author Channel ID: UC32nhVmEAsCvtTk0MTOs1jA
Autores: @Nestln̈nnestle333, @Vivasabri22222
URLs de Channel: http://www.youtube.com/@Nestln%CC%88nnestle333, http://www.youtube.com/@Vivasabri22222

Author Channel ID: UC50ULlbhBSkbNKbXndwDHKA
Autores: @kamalalasucksbigly, @TrumpsYourDaddy
URLs de Channel: http://www.youtube.com/@kamalalasucksbigly, http://www.yo

## Resultado

In [41]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id,language,hashtags,urls,mentions,id_info
1841768,LJOGf-LgIcA,UgwAEqmeCnfI7yLtiYF4AaABAg,@IvorFreedman,https://yt3.ggpht.com/ytc/AIdro_nbIRpRoYqRgJ2s...,http://www.youtube.com/@IvorFreedman,UCFQBD7BPihZStX2Rp_r7y2Q,jdvance good man hopefully future president on...,2024-11-09T10:12:55Z,2024-11-09T10:12:55Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[],[],"(None, None)"
1841769,RF-vcOk6tug,UgxP2KZ-jG2pMAuSa2F4AaABAg,@mkhbell,https://yt3.ggpht.com/ytc/AIdro_kBgG294MFyAxMA...,http://www.youtube.com/@mkhbell,UCM3bebzFxzA90DjuGg1svxg,give stolen land back palestinian people want ...,2024-11-09T08:51:30Z,2024-11-09T08:51:30Z,0,none,True,False,,UC92qc2WwrEdmJ7ThqGR9yfw,__label__en,[],[],[],"(None, None)"
1837747,OUGcrK9fLOo,UgxLpFjDDbR7sdFb1x14AaABAg,@Supablaze-the-first,https://yt3.ggpht.com/ytc/AIdro_nn2r27xgt-QLsA...,http://www.youtube.com/@Supablaze-the-first,UCWUB2Ck82PxM8xyx7QVKpdw,sadly shae might want good day look actually o...,2024-11-05T18:00:31Z,2024-11-05T18:01:07Z,0,none,True,False,,UCBxQIdDzaiQ6u0GQY609EOQ,__label__en,[],[],[],"(None, None)"
1787926,GZcTxRmiPEc,UgynN8lIJeSla5KRfnp4AaABAg,@mr.kartiksahoo2350,https://yt3.ggpht.com/ytc/AIdro_kexSU46w6cYLAw...,http://www.youtube.com/@mr.kartiksahoo2350,UCMrCCqt2AEnvv2o8DHaubpQ,first full arranged unemployment many developm...,2024-11-05T17:49:26Z,2024-11-05T17:49:26Z,0,none,True,False,,UCRgMIwmmh1-2k5HeTQ2cdkQ,__label__en,[],[],[],"(None, None)"
1787927,GZcTxRmiPEc,Ugw56Cp1_PomF7wFI094AaABAg,@AnitaBalhara-n4t,https://yt3.ggpht.com/ytc/AIdro_lE-2v4pt1Am5Vt...,http://www.youtube.com/@AnitaBalhara-n4t,UCD9QWOWXq5H15343K9UW7lg,bjp haryana airport kyu nahi bana rahi hai aad...,2024-11-05T15:27:54Z,2024-11-05T15:27:54Z,1,none,True,False,,UCRgMIwmmh1-2k5HeTQ2cdkQ,__label__en,[],[],[],"(None, None)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777912,E3ln_YTXra8,UgyCGlifSKOqlNnfcmp4AaABAg,@paulbernacki641,https://yt3.ggpht.com/ytc/AIdro_lXZATiSU94AudK...,http://www.youtube.com/@paulbernacki641,UCou7QjOkBW7XBt9zEcqOrMQ,righteous secular jew living manhattan hiding ...,2024-07-15T01:23:40Z,2024-07-15T01:23:40Z,11,none,True,False,,UCoi5pABIDpya7N5OdDnDScg,__label__en,[],[],[],"(None, None)"
1787916,oQmJJt8QK8A,Ugzg0lGjG_StDTL0p3B4AaABAg,@exiledfrommyself,https://yt3.ggpht.com/ytc/AIdro_n_yO2nL6u-QA0W...,http://www.youtube.com/@exiledfrommyself,UCXwxiD_hbbSmIqUtNbeDcyA,seems like monitor need monitor monitor comple...,2024-07-15T01:09:07Z,2024-07-15T01:09:07Z,39,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[],[],"(None, None)"
1777921,E3ln_YTXra8,UgykRu7btKYxl1N3wDN4AaABAg,@FaycalOuazine,https://yt3.ggpht.com/1buFsiRLhPKrvoRlBO0R3UH4...,http://www.youtube.com/@FaycalOuazine,UCDishC6v1o7peNGx6ph_Phg,still dont think israel evil thats israel blam...,2024-07-15T01:04:07Z,2024-07-15T01:04:07Z,3,none,True,False,,UCoi5pABIDpya7N5OdDnDScg,__label__en,[],[],[],"(None, None)"
1787920,oQmJJt8QK8A,UgxNrTwTQMn3qdfcwnd4AaABAg,@ComradeFromRhody401,https://yt3.ggpht.com/7gbMd2lYj-EU0W3eWjx74ZQS...,http://www.youtube.com/@ComradeFromRhody401,UCivqlQc0Af97SUzfbgkzQFw,great segment thanks ryan would love see bpmr ...,2024-07-15T00:29:13Z,2024-07-15T00:29:13Z,18,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[],[],"(None, None)"


In [42]:
import pandas as pd

def compare_dataframes(df1, df2, columns):
    """
    Compara dois DataFrames com base na quantidade de valores nulos, não nulos e valores únicos nas colunas especificadas.

    Parâmetros:
    - df1, df2: DataFrames a serem comparados.
    - columns: Lista de colunas a serem analisadas.

    Retorna:
    - Um dicionário com DataFrames para o resumo comparativo das métricas no formato especificado.
    """
    # Listas para armazenar os resultados de nulos, não nulos e valores únicos
    non_null_data = []
    null_data = []
    unique_data = []

    # Itera sobre as colunas para calcular métricas
    for col in columns:
        # Cálculo dos valores não nulos
        non_null_row = {
            "Coluna": col,
            "Original": df1[col].notna().sum(),
            "Após Tratamento": df2[col].notna().sum(),
            "Diferença": df1[col].notna().sum() - df2[col].notna().sum()
        }
        non_null_data.append(non_null_row)

        # Cálculo dos valores nulos
        null_row = {
            "Coluna": col,
            "Original": df1[col].isna().sum(),
            "Após Tratamento": df2[col].isna().sum(),
            "Diferença": df1[col].isna().sum() - df2[col].isna().sum()
        }
        null_data.append(null_row)

        # Cálculo dos valores únicos
        unique_row = {
            "Coluna": col,
            "Original": df1[col].nunique(),
            "Após Tratamento": df2[col].nunique(),
            "Diferença": df1[col].nunique() - df2[col].nunique()
        }
        unique_data.append(unique_row)

    # Converte as listas em DataFrames para os nulos, não nulos e valores únicos
    non_null_df = pd.DataFrame(non_null_data)
    null_df = pd.DataFrame(null_data)
    unique_df = pd.DataFrame(unique_data)

    # Retorna um dicionário com os DataFrames no formato desejado
    return {
        "Valores Não Nulos": non_null_df,
        "Valores Nulos": null_df,
        "Valores Únicos": unique_df
    }

In [43]:
## Estatísticas Atualizadas

# df1 e df2 são os seus DataFrames de entrada
columns = ['comment_id', 'author_channel_id', 'video_id', 'channel_id']
resultado_comparacao = compare_dataframes(original_comments, palestine_df_comments, columns)

# Exibindo os resultados
print("=== Valores Não Nulos ===")
print(resultado_comparacao["Valores Não Nulos"])

print("\n=== Valores Nulos ===")
print(resultado_comparacao["Valores Nulos"])

print("\n=== Valores Únicos ===")
print(resultado_comparacao["Valores Únicos"])

=== Valores Não Nulos ===
              Coluna  Original  Após Tratamento  Diferença
0         comment_id   1851767           707314    1144453
1  author_channel_id   1851767           707314    1144453
2           video_id   1851767           707314    1144453
3         channel_id   1851767           707314    1144453

=== Valores Nulos ===
              Coluna  Original  Após Tratamento  Diferença
0         comment_id         0                0          0
1  author_channel_id         0                0          0
2           video_id         0                0          0
3         channel_id         0                0          0

=== Valores Únicos ===
              Coluna  Original  Após Tratamento  Diferença
0         comment_id   1850742           707314    1143428
1  author_channel_id    761091           329397     431694
2           video_id      2491             2031        460
3         channel_id       836              585        251


# Exportação de Dados

In [44]:
palestine_df_comments.to_csv('{bp}/{country}/{country}_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_videos.to_csv('{bp}/{country}/{country}_videos_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_channels.to_csv('{bp}/{country}/{country}_channels_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')