In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import quote

### Function to request a page and return its content container

In [2]:
def get_content(url, timeout=30, container_index=3):
  """Download a page and return one of its ``<div class="container">`` elements.

  Parameters
  ----------
  url : str
      Address of the page to download.
  timeout : float, optional
      Seconds to wait for the server. Without a timeout ``requests.get`` may
      block indefinitely on an unresponsive host.
  container_index : int, optional
      Position of the wanted element among all ``div.container`` matches.
      Defaults to 3 (the fourth match).
      NOTE(review): presumably the container holding the article list --
      confirm against the live page layout.

  Returns
  -------
  bs4.element.Tag
      The selected ``div.container`` element.

  Raises
  ------
  requests.RequestException
      If the request fails, times out, or the server answers with an HTTP
      error status.
  IndexError
      If the page has no ``div.container`` at ``container_index``.
  """
  page = requests.get(url, timeout=timeout)
  # Fail on 4xx/5xx instead of parsing an error page.
  page.raise_for_status()
  soup = BeautifulSoup(page.text, "html.parser")
  containers = soup.find_all("div", class_="container")
  try:
    return containers[container_index]
  except IndexError:
    raise IndexError(
      f"no div.container at index {container_index} on {url} "
      f"(found {len(containers)})"
    ) from None

### Class News

In [3]:
class News:
  """Plain record holding the fields scraped for one news article.

  Attributes are stored exactly as passed in: ``judul`` (title), ``narasi``
  (body text), ``tanggal`` (date), ``sumber`` (source), ``kategori``
  (category) and ``penulis`` (author).
  """

  def __init__(self, judul, narasi, tanggal, sumber, kategori, penulis):
    """Store the six article fields on the instance without modification."""
    self.judul, self.narasi, self.tanggal = judul, narasi, tanggal
    self.sumber, self.kategori, self.penulis = sumber, kategori, penulis

  def __str__(self):
    """Return a brace-wrapped, multi-line listing of all six fields."""
    indent = " " * 6
    rows = [
      f"Judul : {self.judul},",
      f"Narasi : {self.narasi},",
      f"Tanggal : {self.tanggal},",
      f"Sumber : {self.sumber}, ",
      f"Kategori : {self.kategori}, ",
      f"Penulis : {self.penulis}",
    ]
    # Every field line is followed by one blank line.
    body = "".join(f"{indent}{row}\n\n" for row in rows)
    return "\n    {  \n" + body + "    }\n    "

### Function to replace \<br> with \n

In [4]:
def handle_breaks(element):
  """Replace every ``<br>`` found under ``element`` with a newline string.

  The element is modified in place; nothing is returned.
  """
  line_breaks = element.find_all("br")
  for tag in line_breaks:
    tag.replace_with("\n")

### Function to get each news article's attributes

In [5]:
# Elements stripped from the page after the title has been read, in removal
# order: (tag name, keyword filters forwarded to ``find_all``).
_NOISE_SELECTORS = (
  ("strong", {}),
  ("div", {"class_": "paradetail"}),
  ("div", {"class_": "lihatjg"}),
  ("div", {"class_": "nav"}),
  ("div", {"class_": "detail__multiple"}),
  ("table", {"class_": "linksisip"}),
  ("video", {}),
  ("div", {"class_": "parallaxindetail scrollpage"}),
  ("div", {"class_": "cb-pemilu"}),
)


def _stripped_text(tag):
  """Return the whitespace-stripped text of ``tag``, or "" when it is None."""
  return "" if tag is None else tag.text.strip()


def get_news_attributes(link, timeout=30):
  """Download one article page and extract its fields into a ``News`` object.

  Parameters
  ----------
  link : str
      URL of the article page.
  timeout : float, optional
      Seconds to wait for the server. Without a timeout ``requests.get`` may
      block indefinitely on an unresponsive host.

  Returns
  -------
  News
      Title, body text, date, source, category and author of the article.
      A field whose element is missing from the page is the empty string;
      the source is always ``"detik"``.

  Raises
  ------
  requests.RequestException
      If the request fails or times out. HTTP error statuses are not raised:
      an error page simply produces a record with empty fields.
  """
  response = requests.get(link, timeout=timeout)
  # Name the parser explicitly: the bare "html" feature lets BeautifulSoup
  # pick whichever parser happens to be installed, so results could differ
  # between machines.
  news_soup = BeautifulSoup(response.text, "html.parser")

  # judul -- read before the clean-up below removes any elements.
  judul = _stripped_text(news_soup.find("h1", class_="detail__title"))

  # narasi -- drop the listed elements from the whole document first.
  # ``find_all`` returns a complete list before anything is removed, so
  # decomposing while iterating is safe.
  for tag_name, filters in _NOISE_SELECTORS:
    for node in news_soup.find_all(tag_name, **filters):
      node.decompose()
  narasi = _stripped_text(news_soup.find("div", class_="detail__body-text"))

  # tanggal
  tanggal = _stripped_text(news_soup.find("div", class_="detail__date"))

  # sumber
  sumber = "detik"

  # kategori -- text of the first span.detail__label on the page.
  label = news_soup.find("span", class_="detail__label")
  kategori = _stripped_text(label)

  # penulis -- the label span is removed before the author block is read.
  # NOTE(review): presumably the label sits inside div.detail__author and
  # would otherwise leak into the author text -- confirm against the page.
  if label is not None:
    label.decompose()
  # Stripped like every other field, so the CSV has no stray whitespace.
  penulis = _stripped_text(news_soup.find("div", class_="detail__author"))

  return News(judul, narasi, tanggal, sumber, kategori, penulis)

### Function to extract the link of each individual news article

In [6]:
def extract_individual_link(element):
  """Collect the article URLs listed under ``element``.

  Parameters
  ----------
  element : bs4.element.Tag
      Element that is searched for ``<article class="list-content__item">``
      entries.

  Returns
  -------
  list of str
      The ``href`` of the first ``<a class="media__link">`` of each article,
      in page order. Articles without such an anchor, or whose anchor has no
      ``href``, are skipped rather than raising ``AttributeError`` or putting
      ``None`` into the list.
  """
  links = []
  for article in element.find_all("article", class_="list-content__item"):
    anchor = article.find("a", class_="media__link")
    if anchor is None:
      continue
    href = anchor.get("href")
    if href:
      links.append(href)
  return links

### Function to make a new dataframe

In [7]:
def create_dataframe():
  """Return an empty DataFrame with one column per news field."""
  field_names = ["judul", "narasi", "tanggal", "sumber", "kategori", "penulis"]
  return pd.DataFrame(columns=field_names)

### Function to insert all news attributes to dataframe

In [8]:
def insert_news(df, news):
  """Append the fields of ``news`` to ``df`` as a new row, in place.

  The row is stored under the label ``len(df)``; nothing is returned.
  """
  fields = ("judul", "narasi", "tanggal", "sumber", "kategori", "penulis")
  record = {field: getattr(news, field) for field in fields}
  next_label = len(df)
  df.loc[next_label] = record

### Function to create list of dates starting from start_date to end_date

In [9]:
def create_date_list(start_date, end_date):
  """Return every day from ``start_date`` to ``end_date`` as a URL-safe string.

  Each day is formatted as ``MM/DD/YYYY`` and then percent-encoded with no
  safe characters, so the slashes become ``%2F``.
  """
  encoded_dates = []
  for day in pd.date_range(start=start_date, end=end_date):
    formatted = day.strftime("%m/%d/%Y")
    encoded_dates.append(quote(formatted, safe=""))
  return encoded_dates

### Function to save the DataFrame as a CSV file

In [10]:
# Output location used when no path is given.
DEFAULT_CSV_PATH = r'C:/Users/Acer/Desktop/Skripsi/Code/program/detikcom-6.csv'


def download_csv(df, path=DEFAULT_CSV_PATH):
  """Write ``df`` to a CSV file encoded as UTF-8 with a byte-order mark.

  Parameters
  ----------
  df : pandas.DataFrame
      Table to save. The index is written as the first column.
  path : str or path-like, optional
      Destination file. Defaults to ``DEFAULT_CSV_PATH``; pass another path
      to run the notebook on a machine where that location does not exist.
  """
  df.to_csv(path, encoding="utf-8-sig")

## Main

In [12]:
# Scrape window (month/day/year, both ends inclusive) and the index page
# queried once per day; ``{date}`` receives the percent-encoded date.
START_DATE = "8/22/2023"
END_DATE = "8/31/2023"
INDEX_URL = "https://news.detik.com/indeks?date={date}"

df = create_dataframe()
failed_links = []  # article URLs whose request failed, kept for a later retry
for date in create_date_list(START_DATE, END_DATE):
  page_content = get_content(INDEX_URL.format(date=date))
  news_links = extract_individual_link(page_content)
  for news_link in news_links:
    try:
      attr = get_news_attributes(news_link)
    except requests.RequestException as error:
      # One unreachable article must not throw away everything collected so
      # far; record it, report it, and move on.
      failed_links.append(news_link)
      print(f"Skipped {news_link}: {error}")
      continue
    insert_news(df, attr)

In [None]:
# Show the scraped rows as the cell's output. `display` is not imported in
# this notebook -- presumably the builtin that IPython/Jupyter kernels provide.
display(df)

In [14]:
# Persist the collected rows to the CSV location configured in download_csv.
download_csv(df)