<a href="https://colab.research.google.com/github/VictorHugoMartins/israel_x_palestine_data_analysis/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pré-Processamento de Dados

Este arquivo prepara os dados para as fases posteriores.


# Importação de Dados

In [1]:
from google.colab import drive
drive.mount('/content/drive')

base_path = '/content/drive/My Drive/Mestrado'

country = 'palestine'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import os

def read_csv_batches_from_dir(directory):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
    df_list = []
    num_files = len(all_files)  # Contador de arquivos processados

    # Lê cada arquivo e adiciona ao df_list
    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)

    # Concatenar todos os DataFrames em um só
    final_df = pd.concat(df_list, ignore_index=True)

    # Imprime a quantidade de arquivos lidos para o diretório
    print(f"Lidos {num_files} arquivos CSV da pasta {directory}")

    return final_df

def import_data(country):
    # Diretórios contendo os arquivos CSV de comentários, vídeos e canais
    comments_dir = os.path.join(base_path, '{country}/comments_csv_batches'.format(country=country))
    videos_dir = os.path.join(base_path, '{country}/videos_csv_batches'.format(country=country))
    channels_dir = os.path.join(base_path, '{country}/channels_csv_batches'.format(country=country))

    # Ler todos os arquivos CSV dos diretórios
    df_comments = read_csv_batches_from_dir(comments_dir)
    df_videos = read_csv_batches_from_dir(videos_dir)
    df_channels = read_csv_batches_from_dir(channels_dir)

    return df_comments, df_videos, df_channels

# Exemplo de uso:
palestine_df_comments, df_videos, df_channels = import_data('palestine')

# Salva os dados originais
original_comments = palestine_df_comments

Lidos 186 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/comments_csv_batches
Lidos 1 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/videos_csv_batches
Lidos 1 arquivos CSV da pasta /content/drive/My Drive/Mestrado/palestine/channels_csv_batches


## Dados Originais

### Comments

In [3]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id
0,Vf5MThSniiY,UgxAlOKUSIYR4zzO5Dh4AaABAg.AAM6Ebg-s4jAAMAxwhHu7i,@jmjfanss,https://yt3.ggpht.com/x2t43hSZiQv9n5SFIByZDJSd...,http://www.youtube.com/@jmjfanss,UCY1_6lo-Yjw95MFvAkIoiwA,"I hear you, that's why I stopped voting after ...",2024-11-02T22:20:07Z,2024-11-02T22:20:07Z,1,none,True,True,UgxAlOKUSIYR4zzO5Dh4AaABAg,UCH1dpzjCEiGAt8CXkryhkZg
1,Vf5MThSniiY,Ugz1WuGNXBEdkIAk7nF4AaABAg,@jesusjaviergarcia,https://yt3.ggpht.com/ytc/AIdro_nI5jsz-0VV4h0y...,http://www.youtube.com/@jesusjaviergarcia,UCMZ6Jqx4HyXAE9-qgVUpAQA,Let the man Speak,2024-11-02T22:19:23Z,2024-11-02T22:19:23Z,0,none,True,False,,UCH1dpzjCEiGAt8CXkryhkZg
2,2SJomX0tw5E,UgzV-IIP1aPgEH0N8Rl4AaABAg.AAHRRN90CaVAAMAfqFqGyT,@anam.caballerowilson9421,https://yt3.ggpht.com/NYwbrC50fBnFHiB2RTrWHpPl...,http://www.youtube.com/@anam.caballerowilson9421,UCnGYqAfe-KVM0QxEkBeMbXA,What traitor? Have you watched Gladiator? he h...,2024-11-02T22:17:39Z,2024-11-02T22:17:39Z,0,none,True,True,UgzV-IIP1aPgEH0N8Rl4AaABAg,UCckHqySbfy5FcPP6MD_S-Yg
3,Vf5MThSniiY,UgwIo0q9vEPdyBqtCAh4AaABAg,@wardyra,https://yt3.ggpht.com/ytc/AIdro_kyx-poWpglC-tv...,http://www.youtube.com/@wardyra,UC4Yz5SaY8hHfqv71CdgnwYw,It's not good that this is a video you have to...,2024-11-02T22:15:25Z,2024-11-02T22:15:25Z,0,none,True,False,,UCH1dpzjCEiGAt8CXkryhkZg
4,YaNcow2MQA8,UgwkDideIRiMeFYOOWp4AaABAg.AAKlMjhd4PUAAMAF5RngF1,@FireAnt745,https://yt3.ggpht.com/ytc/AIdro_nj52r54LP66EYF...,http://www.youtube.com/@FireAnt745,UCmc7w-9RGbzt757OTbGxAsw,"@@cryptoreport8762 Zionism is not Judaism, it'...",2024-11-02T22:13:51Z,2024-11-02T22:13:51Z,0,none,True,True,UgwkDideIRiMeFYOOWp4AaABAg,UC7fWeaHhqgM4Ry-RMpM2YYw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851762,EswmEJOyaFY,UgzstlvfmPXmTWr7P7F4AaABAg,@estanislaubabosoares1677,https://yt3.ggpht.com/ytc/AIdro_k9jCJGi8-Zs-Lb...,http://www.youtube.com/@estanislaubabosoares1677,UCaBYqFZyIi7M4UtvwWw6s1Q,Lanjut om BEN. Stop gencetan senjata. 💪,2024-07-15T02:08:37Z,2024-07-15T02:08:37Z,0,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851763,EswmEJOyaFY,UgwslOqRFx39fwZjRGh4AaABAg,@ahandaja5425,https://yt3.ggpht.com/ytc/AIdro_lO1bg78qyz0HR_...,http://www.youtube.com/@ahandaja5425,UC6I6pE2nQiBhEfjJ5HwFTlg,Berarti idf wajar saja menembaki warga palesti...,2024-07-15T01:38:26Z,2024-07-15T01:38:26Z,0,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851764,EswmEJOyaFY,UgyCpVrAR2Yk5jJFwoV4AaABAg,@Jeckahmed,https://yt3.ggpht.com/k10-YF12hZqsgUjJjNCsyGKF...,http://www.youtube.com/@Jeckahmed,UCx3JwNQeg_bFgYAWsUHj-iw,Alhamdulillah,2024-07-15T01:37:22Z,2024-07-15T01:37:22Z,33,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg
1851765,EswmEJOyaFY,UgwLORvcKeOo2W9VbJd4AaABAg,@muhammadsafii8903,https://yt3.ggpht.com/ytc/AIdro_lPAu5LcUW1h_9K...,http://www.youtube.com/@muhammadsafii8903,UCfxI5fefq674l_Okk3VBTag,Alhamdulillah free palestine🇵🇸❤️,2024-07-15T01:31:47Z,2024-07-15T01:31:47Z,649,none,True,False,,UCmxAIW7RDDC88EPk4ry16Kg


In [4]:
palestine_df_comments.describe()

Unnamed: 0,like_count
count,1851767.0
mean,6.365636
std,119.9675
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,55571.0


### Vídeos

In [5]:
df_videos

Unnamed: 0,video_id,title,description,channel_id,published_at,category_id,tags,view_count,like_count,comment_count,...,scheduled_end_time,concurrent_viewers,active_live_chat_id,recording_date,topicCategories,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
0,5oiduodVVvs,Ehud Barak: Hamas’ October 7 attack exposed Is...,Former Israeli Prime Minister Ehud Barak said ...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T14:14:48Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",2964,944,88,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
1,nXu-rS-W-6E,Muslim Crowd GOES SILENT as Bill Clinton revea...,Wool removed from Pres. Bill Clinton's eyes: J...,UC1EasxeXGzoXJb2y3HTMsLA,2024-11-03T13:52:26Z,25,"['Israel', 'War', 'Terror', 'Hamas', 'Jihadi',...",9138,3826,948,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
2,scDRf0Wskh0,Israel Hezbollah War LIVE | Israel Captures He...,Israel Hezbollah War LIVE | Israel Captures He...,UCef1-8eOpJgud7szVPlZQAQ,2024-11-03T12:35:09Z,25,"['news18', 'cnn news18', 'latest news', 'israe...",16534,197,8,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
3,0zM2pHTqsn0,‘Timing Left To…’ Iran Okays Israel Attack As ...,The United States Central Command announced th...,UC3prwMn9aU2z5Y158ZdGyyA,2024-11-03T12:19:42Z,25,"['iran israel war', 'israel iran war', 'iran i...",9233,759,383,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,
4,uEF08Tq6A5o,What is Jabaliya refugee camp and why is Israe...,"With every war and Israeli assault on Gaza, Ja...",UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T11:46:52Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",4029,1202,215,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,m44oIWP2A1c,#army #military #motivation #keşfet #israel #i...,#army #military #motivation #keşfet #israel #i...,UCvWb3kEo5D4KRgQa-JjceqA,2024-07-15T00:41:57Z,24,"['Army', 'Armyforce', 'Armylover', 'Art', 'Ani...",14312,0,9,...,,0,,,['https://en.wikipedia.org/wiki/Military'],,0,0,0,
2487,jrngJL4Iwo8,Israel Palestine Dress Up (Left or Right?) 🇮🇱🇵🇸,Ok so this is basically another one of THOSE C...,UCy6o8vCErPavp37zwSIGAJA,2024-07-15T00:33:35Z,27,[],13012,279,24,...,,0,,,[],,0,0,0,
2488,lFSr_JDUDQk,How to draw Palestine support Algeria and Bang...,How to draw Palestine support Algeria and Bang...,UChFExE69cWu7x-7FjuJDGiw,2024-07-15T00:30:20Z,28,"['How to draw Palestine flag drawing', 'Islami...",13585,631,9,...,,0,,,"['https://en.wikipedia.org/wiki/Hobby', 'https...",,0,0,0,
2489,rw-8j3U1W28,Roger Waters Criticizes UK Leaders' Support fo...,In a passionate conversation with Piers Morgan...,UCnA2ZZ_6P7DZbmUVEolAp3g,2024-07-15T00:01:26Z,22,[],11324,306,40,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,


In [6]:
df_videos.describe()

Unnamed: 0,category_id,view_count,like_count,comment_count,scheduled_end_time,concurrent_viewers,active_live_chat_id,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
count,2491.0,2491.0,2491.0,2491.0,0.0,2491.0,0.0,0.0,2491.0,2491.0,2491.0,0.0
mean,24.378563,168478.9,5012.417102,766.24448,,0.0,,,0.0,0.0,0.0,
std,3.012292,546599.8,22012.532446,2252.815496,,0.0,,,0.0,0.0,0.0,
min,1.0,1.0,0.0,1.0,,0.0,,,0.0,0.0,0.0,
25%,25.0,3834.0,96.0,9.0,,0.0,,,0.0,0.0,0.0,
50%,25.0,24420.0,649.0,99.0,,0.0,,,0.0,0.0,0.0,
75%,25.0,125813.5,3101.5,628.5,,0.0,,,0.0,0.0,0.0,
max,29.0,17358460.0,770902.0,42331.0,,0.0,,,0.0,0.0,0.0,


### Canais

In [7]:
df_channels

Unnamed: 0,channel_id,title,description,published_at,country,view_count,comment_count,subscriber_count,video_count,is_verified,keywords,profile_picture_url
0,UCkmLoDQ8AVKwEsDSJtNKmug,WOLD NEWS,Subscribe for USA,2024-08-31T21:48:41.363607Z,US,806251,0,912,373,False,"trump noticia ""kamalla harris"" vote",https://yt3.ggpht.com/VZXxIUrbv9NoSwomQz_4sOlC...
1,UCY3XuRwUrx0Tuyfxc9h-CKA,MuslimOdysseyalt,,2024-08-24T11:32:15.475473Z,,462,0,8,5,False,,https://yt3.ggpht.com/rnLjbQXTrKIW7cfHGt_rwqDV...
2,UCXBD5iG5cr4ZYZ99K-fmDHg,NDTV World,"NDTV World delivers a fresh, balanced, and inc...",2024-08-21T12:22:26.662914Z,IN,527475,0,2620,855,False,"""latest world news"" ""global news"" ""daily world...",https://yt3.ggpht.com/XPPea2c5PLsz1QmWmrYf5aSE...
3,UC-qF_LzQLDcjT2hiaz1Fs8g,Monkeypox,Channel ini berencana untuk mengupload Ismael ...,2024-08-03T02:25:35.585393Z,ID,2422978,0,4430,878,False,,https://yt3.ggpht.com/ytc/AIdro_lkd0rxDNf5fMK_...
4,UCZo9arkS0Nn-mVVaVHdwBwA,Daily Motations,Welcome to DailyMotation ! 🚀 Dive into a world...,2024-07-29T14:12:54.844809Z,DE,131345,0,510,119,False,,https://yt3.ggpht.com/AdDTFqDWIkdYwY7flduCS3p7...
...,...,...,...,...,...,...,...,...,...,...,...,...
831,UC1yBKRuGpC1tSM73A0ZjYjQ,The Young Turks,The Young Turks is the longest-running news pr...,2005-12-21T20:46:51Z,US,6913421734,0,6080000,65101,False,"news politics tyt ""young turks"" ""the young tur...",https://yt3.ggpht.com/f5y_0KmKhcvJI4fwO93TZYiv...
832,UCm7lHFkt2yB_WzL67aruVBQ,Hindustan Times,"Hindustan Times Videos bring you news, views a...",2005-11-16T02:41:16Z,IN,5669517181,0,7410000,70374,False,"""world news"" ""us news"" ""HT world"" ""Hindustan T...",https://yt3.ggpht.com/rxxycwwjFXuC-eQNBcklj4P-...
833,UC9LQwHZoucFT94I2h6JOcjw,Liverpool FC,Get closer to the Reds than anyone else!\n\nWe...,2005-10-23T01:19:05Z,GB,3031865702,0,10500000,7235,False,"Liverpool LFC ""Liverpool FC"" ""Liverpool Footba...",https://yt3.ggpht.com/XMb1CWW_li3OqjDsr6UyHdst...
834,UCupvZG-5ko_eiXAupbDfxWw,CNN,CNN is the world leader in news and informatio...,2005-10-02T16:06:36Z,,16867234203,0,17200000,167740,False,"CNN ""CNN News"" news ""breaking news""",https://yt3.ggpht.com/n5DRh94eycw6xGcOKTn6LKQw...


In [8]:
df_channels.describe()

Unnamed: 0,view_count,comment_count,subscriber_count,video_count
count,836.0,836.0,836.0,836.0
mean,643513600.0,0.0,1278458.0,16232.08134
std,2071057000.0,0.0,3543057.0,48133.011191
min,28.0,0.0,0.0,1.0
25%,321835.8,0.0,2512.5,192.75
50%,9390026.0,0.0,47300.0,698.0
75%,213898500.0,0.0,714500.0,4331.5
max,20199960000.0,0.0,38100000.0,611361.0


# Limpeza de Comentários

Redução do vocabulário dos comentários coletados. Vídeos e Canais não passam por esse processo por não serem o foco da pesquisa e servirem como bases auxiliares para o percurso principal.

## Remoção de Dados Duplicados

In [9]:
# Função para remover duplicatas e manter os registros mais recentes.
# Intuito: garantir que não foram coletados dados repetidos para diferentes países
def remove_duplicates(df, id_column, date_column):
    # Ordena por data de forma decrescente e remove duplicatas mantendo o mais recente
    df_sorted = df.sort_values(by=date_column, ascending=False)
    df_unique = df_sorted.drop_duplicates(subset=id_column, keep='first')
    return df_unique

In [10]:
# Gerar as estatísticas
palestine_df_comments = remove_duplicates(palestine_df_comments, 'comment_id', 'updated_at')

## Filtragem por idioma

Apenas comentários em inglês serão utilizados na pesquisa. Não haverá, por agora, uma abordagem multilanguage.

In [11]:
# 1. Instalar FastText
!pip install fasttext

# 2. Importar bibliotecas necessárias
import fasttext
import pandas as pd

# 3. Baixar o modelo de identificação de idiomas do FastText
# Use the raw file URL to download the binary model directly
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O lid.176.bin

--2024-11-22 21:51:21--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.238.176.126, 18.238.176.115, 18.238.176.19, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.238.176.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2024-11-22 21:51:23 (114 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



In [12]:
# Carregae o modelo de linguagem
def load_model():
    model_path = 'lid.176.bin'
    model = fasttext.load_model(model_path)
    return model

model = load_model()

# Função para identificar o idioma de um comentário
def detect_language(comment):
    # Remover quebras de linha
    if isinstance(comment, str):
      comment = comment.replace('\n', ' ').strip()  # Remove quebras de linha e espaços extras
      prediction = model.predict(comment)
      return prediction[0][0]  # Retornar o código do idioma
    else:
      return ''

def filter_by_english(df_comments, verbose=False):
  # Aplicar a função de detecção de idioma a cada comentário
  df_comments['language'] = df_comments['comment'].apply(detect_language)

  # Filtrar os comentários em inglês
  english_comments = df_comments[df_comments['language'] == '__label__en']

  if verbose:
    print(english_comments[['comment_id', 'comment']])

  return english_comments

In [13]:
palestine_df_comments = filter_by_english(palestine_df_comments, verbose=True)

                         comment_id  \
1837743  UgwciMkyTHL7zX5Bc9J4AaABAg   
1841768  UgwAEqmeCnfI7yLtiYF4AaABAg   
1841769  UgxP2KZ-jG2pMAuSa2F4AaABAg   
1841773  UgxKcWS-_u7lThB7gRp4AaABAg   
1841776  UgxJfWinU27jAG799NZ4AaABAg   
...                             ...   
1787923  UgwwKUl0XYJVEz0E-Yd4AaABAg   
1787924  UgwxbAfZN0JK6wahefl4AaABAg   
1777923  UgxuVj1oa7hFOO1E6XZ4AaABAg   
1777924  UgyCme9ns55AV0DxejF4AaABAg   
1807848  UgxW5KJPZvpwdVG282Z4AaABAg   

                                                   comment  
1837743    This is I'll ....the sample is pure master mind  
1841768  J.D.Vance is a good man. Hopefully , he will b...  
1841769  Give the stolen land back to the Palestinian p...  
1841773  While we can all love Trump, let's not forget ...  
1841776                                GOD. BLESS   MEXICO  
...                                                    ...  
1787923                  Thanks Sam so refreshing I’m home  
1787924                                    

### Filtragem de Vídeos e Canais

Aqui, calculamos o nosso universo a quantidade de canais, vídeos e comentários em inglês. Suas novas quantidades são calculadas, assim como o número de usuários comentaristas.

In [14]:
# Filtrar vídeos que possuem channel_id em df_channels e video_id em df_comments
df_videos = df_videos[
    (df_videos['video_id'].isin(palestine_df_comments['video_id']))
]

df_videos

Unnamed: 0,video_id,title,description,channel_id,published_at,category_id,tags,view_count,like_count,comment_count,...,scheduled_end_time,concurrent_viewers,active_live_chat_id,recording_date,topicCategories,processing_status,parts_total,parts_processed,time_left_ms,processing_failure_reason
0,5oiduodVVvs,Ehud Barak: Hamas’ October 7 attack exposed Is...,Former Israeli Prime Minister Ehud Barak said ...,UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T14:14:48Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",2964,944,88,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
1,nXu-rS-W-6E,Muslim Crowd GOES SILENT as Bill Clinton revea...,Wool removed from Pres. Bill Clinton's eyes: J...,UC1EasxeXGzoXJb2y3HTMsLA,2024-11-03T13:52:26Z,25,"['Israel', 'War', 'Terror', 'Hamas', 'Jihadi',...",9138,3826,948,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
2,scDRf0Wskh0,Israel Hezbollah War LIVE | Israel Captures He...,Israel Hezbollah War LIVE | Israel Captures He...,UCef1-8eOpJgud7szVPlZQAQ,2024-11-03T12:35:09Z,25,"['news18', 'cnn news18', 'latest news', 'israe...",16534,197,8,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
3,0zM2pHTqsn0,‘Timing Left To…’ Iran Okays Israel Attack As ...,The United States Central Command announced th...,UC3prwMn9aU2z5Y158ZdGyyA,2024-11-03T12:19:42Z,25,"['iran israel war', 'israel iran war', 'iran i...",9233,759,383,...,,0,,,"['https://en.wikipedia.org/wiki/Military', 'ht...",,0,0,0,
4,uEF08Tq6A5o,What is Jabaliya refugee camp and why is Israe...,"With every war and Israeli assault on Gaza, Ja...",UCR0fZh5SBxxMNYdg0VzRFkg,2024-11-03T11:46:52Z,25,"['middle east eye', 'mee', 'mee news', 'news',...",4029,1202,215,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2486,m44oIWP2A1c,#army #military #motivation #keşfet #israel #i...,#army #military #motivation #keşfet #israel #i...,UCvWb3kEo5D4KRgQa-JjceqA,2024-07-15T00:41:57Z,24,"['Army', 'Armyforce', 'Armylover', 'Art', 'Ani...",14312,0,9,...,,0,,,['https://en.wikipedia.org/wiki/Military'],,0,0,0,
2487,jrngJL4Iwo8,Israel Palestine Dress Up (Left or Right?) 🇮🇱🇵🇸,Ok so this is basically another one of THOSE C...,UCy6o8vCErPavp37zwSIGAJA,2024-07-15T00:33:35Z,27,[],13012,279,24,...,,0,,,[],,0,0,0,
2488,lFSr_JDUDQk,How to draw Palestine support Algeria and Bang...,How to draw Palestine support Algeria and Bang...,UChFExE69cWu7x-7FjuJDGiw,2024-07-15T00:30:20Z,28,"['How to draw Palestine flag drawing', 'Islami...",13585,631,9,...,,0,,,"['https://en.wikipedia.org/wiki/Hobby', 'https...",,0,0,0,
2489,rw-8j3U1W28,Roger Waters Criticizes UK Leaders' Support fo...,In a passionate conversation with Piers Morgan...,UCnA2ZZ_6P7DZbmUVEolAp3g,2024-07-15T00:01:26Z,22,[],11324,306,40,...,,0,,,"['https://en.wikipedia.org/wiki/Politics', 'ht...",,0,0,0,


In [15]:
df_channels = df_channels[
    (df_channels['channel_id'].isin(palestine_df_comments['channel_id'])) |
    (df_channels['channel_id'].isin(palestine_df_comments['author_channel_id']))
]

df_channels

Unnamed: 0,channel_id,title,description,published_at,country,view_count,comment_count,subscriber_count,video_count,is_verified,keywords,profile_picture_url
0,UCkmLoDQ8AVKwEsDSJtNKmug,WOLD NEWS,Subscribe for USA,2024-08-31T21:48:41.363607Z,US,806251,0,912,373,False,"trump noticia ""kamalla harris"" vote",https://yt3.ggpht.com/VZXxIUrbv9NoSwomQz_4sOlC...
1,UCY3XuRwUrx0Tuyfxc9h-CKA,MuslimOdysseyalt,,2024-08-24T11:32:15.475473Z,,462,0,8,5,False,,https://yt3.ggpht.com/rnLjbQXTrKIW7cfHGt_rwqDV...
2,UCXBD5iG5cr4ZYZ99K-fmDHg,NDTV World,"NDTV World delivers a fresh, balanced, and inc...",2024-08-21T12:22:26.662914Z,IN,527475,0,2620,855,False,"""latest world news"" ""global news"" ""daily world...",https://yt3.ggpht.com/XPPea2c5PLsz1QmWmrYf5aSE...
3,UC-qF_LzQLDcjT2hiaz1Fs8g,Monkeypox,Channel ini berencana untuk mengupload Ismael ...,2024-08-03T02:25:35.585393Z,ID,2422978,0,4430,878,False,,https://yt3.ggpht.com/ytc/AIdro_lkd0rxDNf5fMK_...
4,UCZo9arkS0Nn-mVVaVHdwBwA,Daily Motations,Welcome to DailyMotation ! 🚀 Dive into a world...,2024-07-29T14:12:54.844809Z,DE,131345,0,510,119,False,,https://yt3.ggpht.com/AdDTFqDWIkdYwY7flduCS3p7...
...,...,...,...,...,...,...,...,...,...,...,...,...
831,UC1yBKRuGpC1tSM73A0ZjYjQ,The Young Turks,The Young Turks is the longest-running news pr...,2005-12-21T20:46:51Z,US,6913421734,0,6080000,65101,False,"news politics tyt ""young turks"" ""the young tur...",https://yt3.ggpht.com/f5y_0KmKhcvJI4fwO93TZYiv...
832,UCm7lHFkt2yB_WzL67aruVBQ,Hindustan Times,"Hindustan Times Videos bring you news, views a...",2005-11-16T02:41:16Z,IN,5669517181,0,7410000,70374,False,"""world news"" ""us news"" ""HT world"" ""Hindustan T...",https://yt3.ggpht.com/rxxycwwjFXuC-eQNBcklj4P-...
833,UC9LQwHZoucFT94I2h6JOcjw,Liverpool FC,Get closer to the Reds than anyone else!\n\nWe...,2005-10-23T01:19:05Z,GB,3031865702,0,10500000,7235,False,"Liverpool LFC ""Liverpool FC"" ""Liverpool Footba...",https://yt3.ggpht.com/XMb1CWW_li3OqjDsr6UyHdst...
834,UCupvZG-5ko_eiXAupbDfxWw,CNN,CNN is the world leader in news and informatio...,2005-10-02T16:06:36Z,,16867234203,0,17200000,167740,False,"CNN ""CNN News"" news ""breaking news""",https://yt3.ggpht.com/n5DRh94eycw6xGcOKTn6LKQw...


## Download de Bases Pré-Treinadas

In [16]:
!pip install nltk

import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('rslp')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('punkt')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Limpeza Textual


In [17]:
!pip install ftfy
!pip install apyori

import csv
import re
import ftfy
from nltk.corpus import stopwords

import pandas as pd



## Tratamento de colunas: Links, Mentions, Tópicos e Tags

In [18]:
# Função para extrair URLs
def extract_urls(comment):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.findall(url_pattern, comment)

# Função para extrair menções
def extract_mentions(comment):
    mention_pattern = r'@[\w]+'
    return re.findall(mention_pattern, comment)

# Aplicando as funções para criar as novas colunas
palestine_df_comments['urls'] = palestine_df_comments['comment'].apply(extract_urls)
palestine_df_comments['mentions'] = palestine_df_comments['comment'].apply(extract_mentions)

In [19]:
import pandas as pd
import ast

# Função para transformar os elementos em lowercase e extrair parte relevante de URLs
def process_element(element):
    element = element.lower()  # Converte para minúsculas
    if "https://en.wikipedia.org/wiki/" in element:
        # Extrai o termo após '/wiki/' na URL
        return element.split('/')[-1]
    return element

# Função para pré-processar as colunas 'topicCategories' e 'tags'
def preprocess_columns(df_videos):
    # Processa a coluna 'topicCategories'
    if 'topicCategories' in df_videos.columns:
        df_videos['topicCategories'] = df_videos['topicCategories'].apply(
            lambda x: [process_element(item) for item in ast.literal_eval(x)] if isinstance(x, str) else []
        )

    # Processa a coluna 'tags'
    if 'tags' in df_videos.columns:
        df_videos['tags'] = df_videos['tags'].apply(
            lambda x: [process_element(item) for item in ast.literal_eval(x)] if isinstance(x, str) else []
        )

    return df_videos

# Exemplo de uso:
# Preprocessa as colunas 'topicCategories' e 'tags' do df_videos
df_videos = preprocess_columns(df_videos)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videos['topicCategories'] = df_videos['topicCategories'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videos['tags'] = df_videos['tags'].apply(


## Limpeza de texto

In [20]:
def clean_data(text):
    text = str(text)
    text = ftfy.fix_text(text)  # Corrige erros de codificação de text
    text = text.split("#")[0]
    text = text.split("https://")[0]
    text = re.sub(r'k{2,}', '', text) # Remover a sequência  de k's
    text = re.sub(r'\d+', '', text)  # Remove números
    text = re.sub(r'@\w+\s?', '', str(text))  # Remove o padrão "@algumusuario"
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)  # Remove a sequência "RT"
    text = re.sub(r'\s?:\s?', ' ', text)  # Remove o símbolo " : "
    text = re.sub(r'https://\S+', '', text)  # Remove trechos que começam com "https://" seguidos por qualquer sequência de caracteres não espaços em branco
    text = re.sub(r'http://\S+', '', text)  # Remove trechos que começam com "http://" seguidos por qualquer sequência de caracteres não espaços em branco
    text = re.sub(r'[^\w\s]|_+', '', text)  # Remove emojis
    text = re.sub(r'"', '', text)  # Remove aspas
    text = re.sub(r'[^\w\s]', '', text)  # Remove pontuação
    text = re.sub(r'mention', '', text) # Remover a palavra "mention"
    text = re.sub(r'馃', 'c', text) # Decodificar o caractere 'ç'
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Manter apenas palavras no nosso alfabeto

    return text

def clean_data_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ):
    df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(clean_data)
  if ( output_dir ):
    df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [21]:
palestine_df_comments = clean_data_and_save(palestine_df_comments)

## Remoção de StopWords


In [22]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        filtered_text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
        return filtered_text
    else:
        return text

def remove_stopword_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(remove_stopwords)

  if ( output_dir ):df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [23]:
palestine_df_comments = remove_stopword_and_save(palestine_df_comments)

## Transformação de tweets em minúsculos e lematização


In [24]:
nltk.download('punkt_tab')

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Inicializar lematizador
lemmatizer = WordNetLemmatizer()

# Função para lematização
def lemmatize_text(text):
    words = text.split()  # Separar o texto em palavras
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def preprocess_lower(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text.lower(), language='english')
        text = ' '.join(tokens)
        text = lemmatize_text(text)
        return text
    else:
        return text

def preprocess_lower_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): df = pd.read_csv(input_dir, encoding='utf-8')
  df['comment'] = df['comment'].apply(preprocess_lower)

  if ( output_dir): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [25]:
palestine_df_comments = preprocess_lower_and_save(palestine_df_comments)

## Remoção de linhas em branco##


In [26]:
def remove_empty_line_and_save(df, input_dir=None, output_dir=None):
  if ( input_dir ): pd.read_csv(input_dir, encoding='utf-8')

  df.dropna(subset=['comment'], inplace=True)
  df = df[df['comment'].str.strip() != '']

  if ( output_dir ): df.to_csv(output_dir, index=False, encoding='utf-8')
  return df

In [27]:
palestine_df_comments = remove_empty_line_and_save(palestine_df_comments, output_dir='{country}_finalData.csv'.format(country='palestine'))

## Filtro por Keywords

In [28]:
import pandas as pd

# Lista de palavras-chave
key_words = [
    'Palestine', 'Israel', 'Zionism', 'Genocide', 'Jew', 'Muslim',
    'West Bank', 'Gaza', 'middle east', 'Conflict', 'Occupation',
    'Settlements', 'Intifada', 'Ceasefire', 'Hamas', 'IDF', 'Blockade',
    'Refugees', 'Peace talks', 'Jerusalem', 'Two-state solution',
    'UN Resolutions', 'Human rights', 'Military operations', 'Border',
    'Palestinian Authority', 'Apartheid', 'lebanon', 'yemen', 'syria'
]

# Normaliza a lista de palavras-chave para evitar problemas com maiúsculas/minúsculas
key_words_lower = [kw.lower() for kw in key_words]

# Tamanho antes do filtro
print("Tamanho antes do filtro:", len(palestine_df_comments))

# Filtra o DataFrame de vídeos
filtered_videos = df_videos[
    df_videos['title'].str.lower().str.contains('|'.join(key_words_lower))
]

# Mantém apenas os comentários associados aos vídeos filtrados
filtered_comments = palestine_df_comments[
    palestine_df_comments['video_id'].isin(filtered_videos['video_id'])
]

# Tamanho depois do filtro
print("Tamanho depois do filtro:", len(filtered_comments))

# Atualiza o DataFrame de comentários
palestine_df_comments = filtered_comments

Tamanho antes do filtro: 1612842
Tamanho depois do filtro: 1612842


## Filtro por tamanho do comentário

In [29]:
palestine_df_comments = palestine_df_comments[palestine_df_comments['comment'].apply(lambda x: len(x) > 25)]

In [30]:
# Remover registros com author_channel_id ou comment_id nulos
palestine_df_coments = palestine_df_comments.dropna(subset=['author_channel_id', 'comment_id'])

## Remoção de Valores Nulos

É melhor excluir os comentários que não é possível identificar o comment_id, por poderem se tratar de valores repetidos. A mesma coisa para os usuários que não possuem user_channel_id preenchido, não sendo possível identifica-los posteriormente. A coluna author_channel_id foi identificada como a chave única de usuário porque, diferente dos campos author e author_channel_url, não podem ser modificadas dentro do sistema, podendo nos levar a crer que se tratam de usuários distintos quando não o são. Dentre as demais features, não temos valores nulos.

In [31]:
duplicates = palestine_df_comments.groupby('author_channel_id').agg({
    'author': lambda x: list(x),
    'author_channel_url': lambda x: list(x)
}).reset_index()

# Filtrando apenas aqueles que têm mais de um autor ou URL
duplicates = duplicates[(duplicates['author'].apply(lambda x: len(set(x))) > 1) |
                        (duplicates['author_channel_url'].apply(lambda x: len(set(x))) > 1)]

print("\n=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===")
for index, row in duplicates.iterrows():
    print(f"\nAuthor Channel ID: {row['author_channel_id']}")
    print(f"Autores: {', '.join(set(row['author']))}")
    print(f"URLs de Channel: {', '.join(set(row['author_channel_url']))}")


=== Valores de author_channel_id com mais de um valor de author ou author_channel_url ===

Author Channel ID: UC-IpxsAsXJvpMwoZl4k4o2g
Autores: @rundmv6534, @rundmv93
URLs de Channel: http://www.youtube.com/@rundmv93, http://www.youtube.com/@rundmv6534

Author Channel ID: UC-RqsssaH_XTUIVco-YLa9g
Autores: @burakcoskun1865, @burakcoskun38
URLs de Channel: http://www.youtube.com/@burakcoskun38, http://www.youtube.com/@burakcoskun1865

Author Channel ID: UC07Ax-lQHEMIM7THzfjHKjg
Autores: @coldcoldheart2424, @thedemigoddess333
URLs de Channel: http://www.youtube.com/@thedemigoddess333, http://www.youtube.com/@coldcoldheart2424

Author Channel ID: UC1nyoFaVk1jwICBj4Ny4YYw
Autores: @Qamar_lune, @majdaakr6751
URLs de Channel: http://www.youtube.com/@Qamar_lune, http://www.youtube.com/@majdaakr6751

Author Channel ID: UC2B-LVHcVUNXBOGKAVvIVHw
Autores: @BataraGuru777, @Sosialoon777
URLs de Channel: http://www.youtube.com/@BataraGuru777, http://www.youtube.com/@Sosialoon777

Author Channel ID: 

## Resultado

In [32]:
palestine_df_comments

Unnamed: 0,video_id,comment_id,author,author_profile_image_url,author_channel_url,author_channel_id,comment,published_at,updated_at,like_count,viewer_rating,can_rate,is_reply,parent_id,channel_id,language,urls,mentions
1837743,6BO5TSRhj5g,UgwciMkyTHL7zX5Bc9J4AaABAg,@MeanMuggBeatZ.1,https://yt3.ggpht.com/8H_FcB6okKoQtsGEHOugZr3n...,http://www.youtube.com/@MeanMuggBeatZ.1,UCpKTUEyP59E2cmX_kBqRekA,ill sample pure master mind,2024-11-09T20:20:05Z,2024-11-09T20:20:05Z,0,none,True,False,,UCciZcZjvYjz7jpiCraADPHg,__label__en,[],[]
1841768,LJOGf-LgIcA,UgwAEqmeCnfI7yLtiYF4AaABAg,@IvorFreedman,https://yt3.ggpht.com/ytc/AIdro_nbIRpRoYqRgJ2s...,http://www.youtube.com/@IvorFreedman,UCFQBD7BPihZStX2Rp_r7y2Q,jdvance good man hopefully future president on...,2024-11-09T10:12:55Z,2024-11-09T10:12:55Z,0,none,True,False,,UC1EasxeXGzoXJb2y3HTMsLA,__label__en,[],[]
1841769,RF-vcOk6tug,UgxP2KZ-jG2pMAuSa2F4AaABAg,@mkhbell,https://yt3.ggpht.com/ytc/AIdro_kBgG294MFyAxMA...,http://www.youtube.com/@mkhbell,UCM3bebzFxzA90DjuGg1svxg,give stolen land back palestinian people want ...,2024-11-09T08:51:30Z,2024-11-09T08:51:30Z,0,none,True,False,,UC92qc2WwrEdmJ7ThqGR9yfw,__label__en,[],[]
1841773,RWc9EDDKYO0,UgxKcWS-_u7lThB7gRp4AaABAg,@moodiiperson7474,https://yt3.ggpht.com/ytc/AIdro_kXR_s_MBDoBqC4...,http://www.youtube.com/@moodiiperson7474,UCdlQe4A0pXYMchZvSJkJKjw,love trump let forget god still,2024-11-08T14:17:57Z,2024-11-08T14:17:57Z,0,none,True,False,,UC9Y0TYBGxEbrjEFigFldIvw,__label__en,[],[]
1817752,6SUTV7az0fc,Ugyi9u4NGsP_fuOXMml4AaABAg,@blackavenger2437,https://yt3.ggpht.com/z9ZQ2epRUOR3KSiA4uL9mGg4...,http://www.youtube.com/@blackavenger2437,UC9IYrK4JOjbNim2itFYeyrg,fact muslim added stupid class people,2024-11-06T23:37:19Z,2024-11-06T23:37:19Z,0,none,True,False,,UCzuqE7-t13O4NIDYJfakrhw,__label__en,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787919,oQmJJt8QK8A,UgyDI7ZAfobQ8COw5Dt4AaABAg,@DougGrinbergs,https://yt3.ggpht.com/ytc/AIdro_nni-_TA5kbQzmm...,http://www.youtube.com/@DougGrinbergs,UCkhcexYjJdt9LiauQQvc6aw,horrible interview audio make guest show look bad,2024-07-15T00:29:53Z,2024-07-15T00:29:53Z,2,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[]
1787920,oQmJJt8QK8A,UgxNrTwTQMn3qdfcwnd4AaABAg,@ComradeFromRhody401,https://yt3.ggpht.com/7gbMd2lYj-EU0W3eWjx74ZQS...,http://www.youtube.com/@ComradeFromRhody401,UCivqlQc0Af97SUzfbgkzQFw,great segment thanks ryan would love see bpmr ...,2024-07-15T00:29:13Z,2024-07-15T00:29:13Z,18,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[]
1787921,oQmJJt8QK8A,UgwMgG89FXFgtu5TUdl4AaABAg,@JazzMaven,https://yt3.ggpht.com/ytc/AIdro_nn_SRHchTA-GXK...,http://www.youtube.com/@JazzMaven,UCKjrYMo2JreGG5QXpyGhfqg,ryan grimm great journalist,2024-07-15T00:28:47Z,2024-07-15T00:28:47Z,38,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[]
1787923,oQmJJt8QK8A,UgwwKUl0XYJVEz0E-Yd4AaABAg,@barryfinnerty3480,https://yt3.ggpht.com/ytc/AIdro_nlpv1AYACLaXbz...,http://www.youtube.com/@barryfinnerty3480,UC5YuZ3FM01xnMqVG673SNng,thanks sam refreshing im home,2024-07-15T00:19:53Z,2024-07-15T00:19:53Z,6,none,True,False,,UC-3jIAlnQmbbVMV6gR7K8aQ,__label__en,[],[]


In [33]:
import pandas as pd

def compare_dataframes(df1, df2, columns):
    """
    Compara dois DataFrames com base na quantidade de valores nulos, não nulos e valores únicos nas colunas especificadas.

    Parâmetros:
    - df1, df2: DataFrames a serem comparados.
    - columns: Lista de colunas a serem analisadas.

    Retorna:
    - Um dicionário com DataFrames para o resumo comparativo das métricas no formato especificado.
    """
    # Listas para armazenar os resultados de nulos, não nulos e valores únicos
    non_null_data = []
    null_data = []
    unique_data = []

    # Itera sobre as colunas para calcular métricas
    for col in columns:
        # Cálculo dos valores não nulos
        non_null_row = {
            "Coluna": col,
            "Original": df1[col].notna().sum(),
            "Após Tratamento": df2[col].notna().sum(),
            "Diferença": df1[col].notna().sum() - df2[col].notna().sum()
        }
        non_null_data.append(non_null_row)

        # Cálculo dos valores nulos
        null_row = {
            "Coluna": col,
            "Original": df1[col].isna().sum(),
            "Após Tratamento": df2[col].isna().sum(),
            "Diferença": df1[col].isna().sum() - df2[col].isna().sum()
        }
        null_data.append(null_row)

        # Cálculo dos valores únicos
        unique_row = {
            "Coluna": col,
            "Original": df1[col].nunique(),
            "Após Tratamento": df2[col].nunique(),
            "Diferença": df1[col].nunique() - df2[col].nunique()
        }
        unique_data.append(unique_row)

    # Converte as listas em DataFrames para os nulos, não nulos e valores únicos
    non_null_df = pd.DataFrame(non_null_data)
    null_df = pd.DataFrame(null_data)
    unique_df = pd.DataFrame(unique_data)

    # Retorna um dicionário com os DataFrames no formato desejado
    return {
        "Valores Não Nulos": non_null_df,
        "Valores Nulos": null_df,
        "Valores Únicos": unique_df
    }

In [34]:
## Estatísticas Atualizadas

# df1 e df2 são os seus DataFrames de entrada
columns = ['comment_id', 'author_channel_id', 'author', 'author_channel_url', 'video_id', 'channel_id']
resultado_comparacao = compare_dataframes(original_comments, palestine_df_comments, columns)

# Exibindo os resultados
print("=== Valores Não Nulos ===")
print(resultado_comparacao["Valores Não Nulos"])

print("\n=== Valores Nulos ===")
print(resultado_comparacao["Valores Nulos"])

print("\n=== Valores Únicos ===")
print(resultado_comparacao["Valores Únicos"])

=== Valores Não Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id   1851767          1132801     718966
1   author_channel_id   1851767          1132801     718966
2              author   1850226          1131774     718452
3  author_channel_url   1851764          1132801     718963
4            video_id   1851767          1132801     718966
5          channel_id   1851767          1132801     718966

=== Valores Nulos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id         0                0          0
1   author_channel_id         0                0          0
2              author      1541             1027        514
3  author_channel_url         3                0          3
4            video_id         0                0          0
5          channel_id         0                0          0

=== Valores Únicos ===
               Coluna  Original  Após Tratamento  Diferença
0          comment_id   185

# Exportação de Dados

In [35]:
palestine_df_comments.to_csv('{bp}/{country}/{country}_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_videos.to_csv('{bp}/{country}/{country}_videos_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')
df_channels.to_csv('{bp}/{country}/{country}_channels_finalData.csv'.format(bp=base_path, country=country), encoding='utf-8')