In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import quote

### Function for page request

In [2]:
def get_content(url, timeout=30) :
  """Fetch a listing page and return its "latest--news" container.

  Parameters
  ----------
  url : str
      Address of the page to download.
  timeout : float or tuple, optional
      Seconds to wait for the server; forwarded to ``requests.get``.
      Without a timeout ``requests`` waits indefinitely, so a single stalled
      connection would freeze the whole crawl.

  Returns
  -------
  bs4.element.Tag or None
      The first ``<div class="latest--news">`` found in the page, or ``None``
      when the page contains no such element. Callers must handle ``None``.

  Raises
  ------
  requests.exceptions.RequestException
      If the request fails or exceeds ``timeout``.

  Notes
  -----
  The HTTP status code is not checked; the response body is parsed whatever
  the status was.
  """
  page = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(page.text, "html.parser")
  return soup.find("div", class_="latest--news")

### Class News

In [3]:
class News:
  """Plain container for the fields scraped from one news article.

  Attributes are stored exactly as passed in: ``judul`` (title), ``narasi``
  (body text), ``tanggal`` (date), ``sumber`` (source), ``kategori``
  (category) and ``penulis`` (author).
  """

  def __init__(self, judul, narasi, tanggal, sumber, kategori, penulis):
    self.judul, self.narasi, self.tanggal = judul, narasi, tanggal
    self.sumber, self.kategori, self.penulis = sumber, kategori, penulis

  def __str__(self):
    """Return the multi-line, brace-wrapped text form of the article."""
    # The layout is spelled out piece by piece so that the trailing spaces
    # and the blank line after every field are visible in the source.
    opening = "\n    {  \n"
    fields = (
      f"      Judul : {self.judul},\n\n"
      f"      Narasi : {self.narasi},\n\n"
      f"      Tanggal : {self.tanggal},\n\n"
      f"      Sumber : {self.sumber}, \n\n"
      f"      Kategori : {self.kategori}, \n\n"
      f"      Penulis : {self.penulis}\n\n"
    )
    closing = "    }\n    "
    return opening + fields + closing

### Function to replace \<br> with \n

In [4]:
def handle_breaks(element) :
  """Replace every ``<br>`` found under ``element`` with a newline, in place.

  ``element`` must provide ``find_all``; each returned tag must provide
  ``replace_with``. Nothing is returned.
  """
  line_breaks = element.find_all("br")
  for tag in line_breaks :
    tag.replace_with("\n")

### Function to get each news article's attributes

In [14]:
def _tag_text(tag) :
  """Return the whitespace-stripped text of ``tag``, or "" when ``tag`` is missing."""
  return tag.text.strip() if tag else ""


def get_news_attributes(link, timeout=30) :
  """Download one article page and extract its fields into a ``News`` object.

  Every field falls back to an empty string when its element is absent from
  the page, so a page with an unexpected layout produces a partially empty
  ``News`` instead of raising.

  Parameters
  ----------
  link : str
      URL of the article page.
  timeout : float or tuple, optional
      Seconds to wait for the server; forwarded to ``requests.get`` so a
      stalled connection cannot hang the crawl forever.

  Returns
  -------
  News
      ``judul`` from ``h1.read__title``; ``narasi`` from the ``div.clearfix``
      inside ``div.read__content``; ``tanggal`` from ``div.read__time`` with
      its ``<a>`` removed; ``sumber`` fixed to ``"kompas"``; ``kategori`` from
      the ``itemprop="name"`` element of the last ``li.breadcrumb__item``;
      ``penulis`` from ``div.credit-title-name`` (not stripped).

  Raises
  ------
  requests.exceptions.RequestException
      If the request fails or exceeds ``timeout``.
  """
  # fetch link
  response = requests.get(link, timeout=timeout)
  # NOTE(review): "html" names a markup type, not a specific parser, so the
  # parser actually used presumably depends on what is installed - confirm
  # whether this should be "html.parser" like the listing-page fetch.
  news_soup = BeautifulSoup(response.text, "html")

  # judul - read before any tag is removed from the tree
  judul = _tag_text(news_soup.find("h1", class_="read__title"))

  # narasi: drop every <strong> element from the page first so its text is
  # not part of the extracted body (this also affects the fields read below)
  for strong in news_soup.find_all("strong") :
    strong.decompose()

  # remove the element directly following the "EndOfArticle" marker; the
  # marker may be the last element, in which case there is nothing to remove
  end_of_article = news_soup.find("div", id="EndOfArticle")
  if end_of_article :
    trailing = end_of_article.find_next_sibling()
    if trailing is not None :
      trailing.decompose()

  narasi_content = news_soup.find("div", class_="read__content")
  narasi_tag = narasi_content.find("div", class_="clearfix") if narasi_content else None
  narasi = _tag_text(narasi_tag)

  # tanggal: strip the embedded link, but only when both the container and
  # the link exist (either may be missing on some pages)
  tanggal_tag = news_soup.find("div", class_="read__time")
  if tanggal_tag :
    time_link = tanggal_tag.find("a")
    if time_link is not None :
      time_link.decompose()
  tanggal = _tag_text(tanggal_tag)

  # sumber
  sumber = "kompas"

  # kategori: name of the last breadcrumb entry; pages without breadcrumbs
  # yield an empty category instead of an IndexError
  breadcrumb_all = news_soup.find_all("li", class_="breadcrumb__item")
  kategori_tag = breadcrumb_all[-1].find(attrs={"itemprop" : "name"}) if breadcrumb_all else None
  kategori = _tag_text(kategori_tag)

  # penulis - kept unstripped, exactly as found in the page
  penulis_tag = news_soup.find("div", class_="credit-title-name")
  penulis = penulis_tag.text if penulis_tag else ""

  return News(judul, narasi, tanggal, sumber, kategori, penulis)

### Function to extract the link of each individual news article

In [7]:
def extract_individual_link(element) :
  """Collect the article URLs listed inside a listing-page container.

  Parameters
  ----------
  element : bs4.element.Tag or None
      Container holding ``<div class="article__list clearfix">`` entries.
      ``None`` (no container was found on the page) is accepted and yields
      an empty list.

  Returns
  -------
  list of str
      The ``href`` of the ``<a class="article__link">`` in each article
      entry, in page order. Entries without such an anchor, or whose anchor
      has no ``href``, are skipped so callers never receive ``None`` as a URL.
  """
  if element is None :
    return []

  links = []
  for article in element.find_all("div", class_="article__list clearfix") :
    anchor = article.find("a", class_="article__link")
    href = anchor.get("href") if anchor is not None else None
    if href :
      links.append(href)
  return links

### Function to make a new dataframe

In [8]:
def create_dataframe() :
  """Build an empty DataFrame with one column per news field."""
  return pd.DataFrame(
    columns=["judul", "narasi", "tanggal", "sumber", "kategori", "penulis"]
  )

### Function to insert all news attributes into the dataframe

In [9]:
def insert_news(df, news) :
  """Append the fields of ``news`` to ``df`` as one new row, in place.

  The row is stored under the label ``len(df)``. The six attributes
  ``judul``, ``narasi``, ``tanggal``, ``sumber``, ``kategori`` and
  ``penulis`` are read from ``news`` and keyed by the same names.
  """
  field_names = ("judul", "narasi", "tanggal", "sumber", "kategori", "penulis")
  record = {name : getattr(news, name) for name in field_names}
  next_label = len(df)
  df.loc[next_label] = record

### Function to create list of dates starting from start_date to end_date

In [2]:
def create_date_list(start_date, end_date) :
  """Return every calendar day from ``start_date`` to ``end_date`` inclusive.

  Each day is formatted as ``YYYY-MM-DD`` and then percent-encoded with
  ``quote(..., safe="")`` so it can be placed in a URL. The list is in
  chronological order.
  """
  encoded_days = []
  for day in pd.date_range(start=start_date, end=end_date) :
    iso_day = day.strftime("%Y-%m-%d")
    encoded_days.append(quote(iso_day, safe=""))
  return encoded_days

### Function to save the dataframe as a CSV file

In [11]:
import os
def download_csv(df, file_path_name=r'C:/Users/Acer/Desktop/Skripsi/Code/program/kompas-3.csv') :
  """Write ``df`` to a CSV file, refusing to overwrite an existing file.

  Parameters
  ----------
  df : pandas.DataFrame
      Table to export. The index is written as the first column.
  file_path_name : str, optional
      Destination path. Defaults to the location this notebook originally
      used; pass another path to run on a different machine.

  Raises
  ------
  FileExistsError
      If ``file_path_name`` already exists, so an earlier export is never
      silently replaced.
  """
  if os.path.exists(file_path_name) :
    raise FileExistsError(f"The file already exists: {file_path_name}")
  # "utf-8-sig" prepends a byte-order mark to the file.
  df.to_csv(file_path_name, encoding="utf-8-sig")

## Main

In [15]:
# Crawl the search listing of every day in the range and collect each linked
# article into one DataFrame.
df = create_dataframe()
for date in create_date_list("2023-01-01","2023-08-31"):
  page_content = get_content(f"https://news.kompas.com/search/{date}")
  # get_content returns None when the page has no "latest--news" container;
  # skip that day instead of letting one page abort the whole crawl.
  if page_content is None :
    continue
  news_links = extract_individual_link(page_content)
  for news_link in news_links :
    attr = get_news_attributes(news_link)
    insert_news(df, attr)

In [None]:
# Show the collected articles in the cell output.
display(df)

In [17]:
# Export the collected articles; download_csv raises FileExistsError when the
# target CSV already exists, so re-running this cell never overwrites it.
download_csv(df)