In [1]:
from datetime import datetime

def convert_date(date_string):
    """Convert a scraped timestamp into a database-friendly string.

    Args:
        date_string: Text laid out as ``DD/MM/YYYY às HH:MM``. Any leading or
            trailing whitespace is ignored.

    Returns:
        The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``. The input carries
        no seconds, so the seconds field is always ``00``.

    Raises:
        ValueError: If the text does not follow the expected layout.
    """
    # Strip before parsing so the result does not depend on how much
    # whitespace surrounds the text inside the scraped element.
    date_o = datetime.strptime(date_string.strip(), "%d/%m/%Y às %H:%M")

    # Render in the layout expected by the database
    return date_o.strftime("%Y-%m-%d %H:%M:%S")

In [2]:
def find_paragraphs(post_content):
  """Concatenate the text of the direct-child paragraphs of an article body.

  Args:
    post_content: Parsed element holding the article body, or ``None`` when
      the page has no such element.

  Returns:
    The text of every direct-child ``<p>``, each followed by a single space
    (so a non-empty result ends with a space). Paragraphs that have any
    ancestor whose ``class`` attribute contains ``'read__too'`` are skipped.
    Returns ``""`` when ``post_content`` is ``None`` or has no paragraphs.
  """
  # A page without a body element simply has no text to collect
  if post_content is None:
    return ""

  kept = []
  for p in post_content.find_all('p', recursive=False):
    # Look at every ancestor, not just the immediate parent
    inside_read_too = any(
        'read__too' in ancestor.get('class', []) for ancestor in p.parents
    )
    if not inside_read_too:
      kept.append(p.get_text() + " ")

  # Join once instead of growing a string inside the loop
  return "".join(kept)

In [3]:
def topicos_tidy(topicos_content):
  """Flatten a topics element into a single ``', '``-separated string.

  Args:
    topicos_content: Parsed element containing the topics, or ``None``.

  Returns:
    ``None`` when no element was given; otherwise the non-blank text
    fragments of the element, trimmed and joined with ``', '``.
  """
  if topicos_content is None:
    return None

  # Render the element's text with commas between fragments
  raw_text = topicos_content.get_text(separator=',')

  # Break on every comma, keeping only fragments that are not blank
  cleaned = []
  for piece in raw_text.split(','):
    piece = piece.strip()
    if piece:
      cleaned.append(piece)

  # Re-assemble as a tidy comma-separated list
  return ', '.join(cleaned)

In [4]:
import requests
from bs4 import BeautifulSoup

def extractor(noticia):
  """Extract the fields of one news item from its listing entry and its page.

  Reads title, subtitle, date and link from the listing entry, then downloads
  the linked article to read the author, body text and topics.

  Args:
    noticia: Parsed listing element for a single news item.

  Returns:
    A tuple ``(title, subtitle, autor, date, texto_noticia, topicos,
    news_link)``. ``autor`` and ``topicos`` are ``None`` when the article page
    does not provide them; ``texto_noticia`` is ``""`` and ``topicos`` is
    ``None`` when the page has no ``post__content`` element.

  Raises:
    AttributeError: If the listing entry lacks the ``home__list__tag`` anchor
      or the ``home__title__date`` span.
    ValueError: If the date text is not in the layout ``convert_date`` expects.
    requests.exceptions.RequestException: If the article cannot be downloaded,
      including when the server does not answer within the timeout.
  """
  # Anchor in the listing thumbnail that carries title, subtitle and link
  main_tags = noticia.find('a', attrs={'class': 'home__list__tag'})

  # Title: visible anchor text
  title = main_tags.text.strip()

  # Subtitle: stored in the anchor's 'title' attribute
  subtitle = main_tags.get('title')

  # Publication date, normalised for the database
  date = noticia.find('span', attrs={'class': 'home__title__date'})
  date = convert_date(date.text)

  # Link to the full article
  news_link = main_tags.get('href')

  # Fetch the article page. The timeout keeps one unresponsive page from
  # blocking the whole crawl indefinitely.
  response_B = requests.get(news_link, timeout=30)
  site_B = BeautifulSoup(response_B.content, 'html.parser')

  # Author: optional, so check each lookup explicitly
  author_group = site_B.find('span', attrs={'class': 'author__group'})
  autor_tag = author_group.find('a') if author_group is not None else None
  autor = autor_tag.text if autor_tag is not None else None

  # Article body and topics both live under the post content element
  post_content = site_B.find('div', attrs={'class': 'post__content'})
  if post_content is None:
    # Page without a regular body: report empty text and no topics
    texto_noticia = ""
    topicos = None
  else:
    texto_noticia = find_paragraphs(post_content)
    topicos_content = post_content.find('ul', attrs={'class': 'tags__list'})
    topicos = topicos_tidy(topicos_content)

  # Return the attributes of interest
  return (title, subtitle, autor, date, texto_noticia, topicos, news_link)


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

# CSV file written by the crawl and rewritten after de-duplication
CSV_PATH = 'Lista3_data_dm.csv'
# Listing pages to crawl, FIRST_PAGE..LAST_PAGE inclusive
FIRST_PAGE = 1
LAST_PAGE = 77
# Seconds to wait for a listing page before giving up on the request
REQUEST_TIMEOUT = 30

# Crawl every listing page and stream one CSV row per news item
with open(CSV_PATH, 'w', newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile)
  # Header row; column order matches the tuple returned by extractor()
  writer.writerow(['Título', 'Subtítulo', 'Autor', 'Data', 'Conteudo_texto', 'Tópicos', 'Link'])

  for i in range(FIRST_PAGE, LAST_PAGE + 1):

    page_link = f'https://www.cnnbrasil.com.br/ultimas-noticias/pagina/{i}/'
    # Progress trace: one line per listing page
    print(page_link)

    # Without a timeout a stalled connection would block the crawl forever
    response = requests.get(page_link, timeout=REQUEST_TIMEOUT)
    site = BeautifulSoup(response.content, 'html.parser')

    # Listing entries of interest (find_all is the current name of findAll)
    noticias = site.find_all('li', attrs={'class': 'home__list__item'})

    for noticia in noticias:
      writer.writerow(extractor(noticia))


# Reload the finished file to look for rows that were collected twice
df = pd.read_csv(CSV_PATH)

# keep=False flags every member of a duplicate group, not only the repeats
duplicate_rows = df[df.duplicated(keep=False)]

print("Duplicate Rows:")
print(duplicate_rows)

# Keep the first occurrence of each row
df = df.drop_duplicates()

# Write the de-duplicated table back over the original CSV
df.to_csv(CSV_PATH, index=False, encoding='utf-8')

https://www.cnnbrasil.com.br/ultimas-noticias/pagina/1/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/2/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/3/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/4/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/5/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/6/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/7/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/8/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/9/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/10/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/11/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/12/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/13/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/14/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/15/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/16/
https://www.cnnbrasil.com.br/ultimas-noticias/pagina/17/
https://www.cnnbrasil.com.br/ultimas-not