In [3]:
!pip install schedule

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting schedule
  Downloading schedule-1.2.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.0


In [4]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import schedule
import time
import os
from tqdm import tqdm

In [5]:
# Root of the project checkout. Defaults to the Google Drive location used on
# Colab; set the REPO_DIR environment variable to run the notebook elsewhere.
REPO_DIR = os.environ.get('REPO_DIR') or '/content/drive/MyDrive/pantanal.dev/artificial-intelligence'

In [8]:
# Make the repository root the working directory so the relative paths used
# further down ('google-news.csv', 'datasets/google-news/...') resolve.
# NOTE(review): the default REPO_DIR lives under /content/drive, so Google
# Drive presumably has to be mounted first; no mount cell is visible here.
os.chdir(REPO_DIR)

# Raspagem das notícias

In [None]:
# Google News topic feeds, one in Portuguese (Brazil) and one in English (US).
# The literals must not end in whitespace: a trailing space would be sent as
# part of the `ceid` query value.
pt_url = 'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JYQjBMVUpTR2dKQ1VpZ0FQAQ?hl=pt-BR&gl' \
         '=BR&ceid=BR%3Apt-419'
eng_url = 'https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US' \
          '&ceid=US%3Aen'

In [None]:
# Column order of every scraped record and of the CSV file.
_NEWS_COLUMNS = ['url', 'font_icon', 'font', 'text', 'datetime', 'lang']


def _parse_article(article, lang):
    """Turn one ``<article>`` tag into a record dict, or ``None`` if incomplete.

    A record is considered incomplete when any of the expected child elements
    or attributes is absent from the markup.
    """
    link = article.find('a', {'class': 'WwrzSb'})
    icon = article.find('img', {'class': 'qEdqNd'})
    source = article.find('span', {'class': 'vr1PYe'})
    headline = article.find('h4', {'class': 'gPFEn'})
    published = article.find('time', {'class': 'hvbAAd'})

    if any(tag is None for tag in (link, icon, source, headline, published)):
        return None

    record = {
        'url': link.get('href'),
        'font_icon': icon.get('src'),
        'font': source.text,
        'text': headline.text,
        'datetime': published.get('datetime'),
        'lang': lang,
    }
    # `.get` yields None for a missing attribute; such a row would be dropped
    # by the NaN filter anyway, so discard it here.
    if any(value is None for value in record.values()):
        return None
    return record


def _clean_news(frame):
    """Drop rows with missing values, then keep the first row per headline text."""
    # Missing values go first so that an incomplete row can never win the
    # de-duplication against a complete row carrying the same headline.
    return frame.dropna().drop_duplicates(subset=['text'], ignore_index=True)


def get_news(url, lang, csv_path='google-news.csv', timeout=30):
    """Scrape a Google News topic page and merge its headlines into a CSV file.

    Args:
        url: Address of the topic page to download.
        lang: Language tag stored in the ``lang`` column of every new record.
        csv_path: Pipe-separated CSV file that accumulates the records.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the download fails, times out or returns
            an HTTP error status.

    The file is rewritten only when new headlines were found or when cleaning
    removed rows from the stored data. A one-line summary is printed each run.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    records = []
    for article in soup.find_all('article', {'class': 'UwIKyb'}):
        record = _parse_article(article, lang)
        # Skip articles whose markup is incomplete instead of aborting the run.
        if record is not None:
            records.append(record)

    scraped_df = pd.DataFrame(records, columns=_NEWS_COLUMNS)

    if os.path.isfile(csv_path):
        existing_df = pd.read_csv(csv_path, sep='|')
        print(f'Loaded {csv_path} | Length:', len(existing_df))

        # Clean the stored rows on their own first, so the "new" count only
        # reflects headlines contributed by this scrape.
        baseline_df = _clean_news(existing_df)
        if scraped_df.empty:
            concat_df = baseline_df
        else:
            concat_df = _clean_news(pd.concat([existing_df, scraped_df], ignore_index=True))

        new_registries_count = len(concat_df) - len(baseline_df)
        needs_save = new_registries_count > 0 or len(baseline_df) != len(existing_df)
    else:
        concat_df = _clean_news(scraped_df)
        new_registries_count = len(concat_df)
        needs_save = new_registries_count > 0

    if needs_save:
        concat_df.to_csv(csv_path, index=False, sep='|')

    timestamp = pd.Timestamp.now().strftime('%H:%M:%S')
    print(
        f'{timestamp} - Saved new {new_registries_count} {lang} registries to DataFrame | DF length: {len(concat_df)}')

In [None]:
# Schedule the scraper to run every 1 minute for each language.
# clear() first removes previously registered jobs, so re-running this cell
# does not register the same job twice.
# NOTE(review): the saved output of this cell reports "Every 15 minutes", so
# the interval was changed after the cell was last executed.
schedule.clear()
schedule.every(1).minutes.do(get_news, pt_url, 'ptbr')
schedule.every(1).minutes.do(get_news, eng_url, 'eng')

Every 15 minutes do get_news('https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen ', 'eng') (last run: [never], next run: 2023-05-03 22:31:38)

In [None]:
# Run the scraper until the kernel is interrupted or no job is left.
while True:
    # Run every job whose scheduled time has arrived
    schedule.run_pending()

    # Seconds until the next job is due; None means nothing is scheduled
    idle = schedule.idle_seconds()
    if idle is None:
        print('No scheduled jobs - stopping the loop.')
        break

    # Overdue jobs (idle <= 0) are retried immediately. Otherwise wait at least
    # one whole second, so a sub-second remainder does not turn into a busy loop.
    remaining = max(1, int(idle)) if idle > 0 else 0

    # Show a progress bar with the time remaining
    for _ in tqdm(range(remaining), desc='Time remaining', unit='s', leave=False):
        time.sleep(1)

# Tratamento do dataset

In [11]:
# Load the labelled headlines workbook (path is relative to the working
# directory) and display it.
df = pd.read_excel('datasets/google-news/google-news-labelled.xlsx')
df

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,url,font_icon,font,text,datetime,lang,label
0,./articles/CBMiggFodHRwczovL3d3dy5jb2luZGVzay5...,https://encrypted-tbn0.gstatic.com/faviconV2?u...,CoinDesk,JPMorgan Chase to Take Over Most of First Repu...,2023-05-01 13:00:00,eng,Neutral
1,./articles/CBMia2h0dHBzOi8vd3d3LmJsb29tYmVyZy5...,https://encrypted-tbn1.gstatic.com/faviconV2?u...,Bloomberg,"Fed Should End 'Crazy' Policy Tightening, Says...",2023-05-01 13:44:16,eng,negative
2,./articles/CBMifmh0dHBzOi8vd3d3Lm1hcmtldHdhdGN...,https://encrypted-tbn1.gstatic.com/faviconV2?u...,MarketWatch,J&J's Kenvue spinoff to hit IPO market this we...,2023-05-01 17:49:00,eng,positive
3,./articles/CCAiCzI1WVBrMkNBV2Z3mAEB?hl=en-US&g...,https://yt3.ggpht.com/0JXei0z2tMwue1et0rF3_QQt...,Bloomberg Technology,'Bloomberg Technology' Full Show (05/01/2023),2023-05-01 19:46:13,eng,neutral
4,./articles/CBMicmh0dHBzOi8vZmVkZXJhbG5ld3NuZXR...,https://encrypted-tbn0.gstatic.com/faviconV2?u...,Federal News Network,Two keys to establishing a comprehensive cyber...,2023-05-01 20:26:15,eng,neutral
...,...,...,...,...,...,...,...
1995,./articles/CBMigAFodHRwczovL3ZhbG9yaW52ZXN0ZS5...,https://encrypted-tbn0.gstatic.com/faviconV2?u...,Valor Investe,Petrobras (PETR3;PETR4) traz números operacion...,2023-05-04 14:51:48,ptbr,Neutral
1996,./articles/CBMiVmh0dHBzOi8vcG9ydGFsZGJvLmNvbS5...,https://encrypted-tbn1.gstatic.com/faviconV2?u...,Portal DBO,"Lucro da Mosaic cai 63,2% no 1º trimestre, par...",2023-05-04 14:53:00,ptbr,negative
1997,./articles/CBMieWh0dHBzOi8vd3d3LmluZm9tb25leS5...,https://encrypted-tbn1.gstatic.com/faviconV2?u...,InfoMoney,GPA (PCAR3) mira retomada das margens e estima...,2023-05-04 15:12:48,ptbr,positive
1998,./articles/CBMiYmh0dHBzOi8vd3d3LmVzdGFkYW8uY29...,https://encrypted-tbn3.gstatic.com/faviconV2?u...,Economia & Negócios Estadão,GPA tem prejuízo líquido de R$ 248 milhões no ...,2023-05-04 15:24:31,ptbr,negative


In [12]:
# Keep the first row for each distinct headline text, then discard rows with
# any missing value. Both calls modify `df` in place; the shape shown is the
# result after cleaning.
df.drop_duplicates(subset='text', inplace=True)
df.dropna(inplace=True)
df.shape

(2000, 7)

In [13]:
# Number of headlines per language tag.
df['lang'].value_counts()

eng     1230
ptbr     770
Name: lang, dtype: int64

In [14]:
# Number of headlines per value of the `font` column (the news outlet name).
df['font'].value_counts()

Yahoo Finance              101
CNBC                        95
InfoMoney                   52
Bloomberg                   50
Investing.com Brasil        48
                          ... 
Nextgov                      1
Bozeman Daily Chronicle      1
PCMag                        1
Denver7                      1
Portal DBO                   1
Name: font, Length: 530, dtype: int64

In [15]:
# Integer class id assigned to each sentiment label.
label_dict = {
    'positive': 2,
    'neutral': 1,
    'negative': 0
}


def _encode_label(label):
    """Return the integer class id for one raw label value.

    Text labels are matched ignoring case and surrounding whitespace (the
    workbook mixes spellings such as 'Neutral' and 'neutral'). A value that is
    already one of the class ids is returned unchanged, so this cell can be
    re-run on an encoded column. Anything else raises ``KeyError``.
    """
    if isinstance(label, str):
        return label_dict[label.strip().lower()]
    if label in label_dict.values():
        return int(label)
    raise KeyError(label)


df['label'] = df['label'].apply(_encode_label)

In [16]:
# Class balance after encoding (2 = positive, 1 = neutral, 0 = negative).
df['label'].value_counts()

1    908
0    579
2    513
Name: label, dtype: int64

In [17]:
# Write the cleaned, encoded dataset as a pipe-separated CSV without the
# index column.
df.to_csv('datasets/google-news/google-news-labelled.csv', sep='|', index=False)