In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

### Function for page request

In [2]:
def get_content(url, timeout=30) :
  """Download a page and return its ``<div id="main-content">`` element.

  Parameters
  ----------
  url : str
      Address of the page to fetch.
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).
      Without a timeout ``requests.get`` may block indefinitely.

  Returns
  -------
  The first ``div`` whose ``id`` is ``main-content``, or ``None`` when the
  page contains no such element.

  Raises
  ------
  requests.exceptions.HTTPError
      If the server answers with a 4xx/5xx status.
  requests.exceptions.Timeout
      If the server does not answer within ``timeout`` seconds.
  """
  page = requests.get(url, timeout=timeout)
  # Fail loudly on error responses instead of parsing an error page.
  page.raise_for_status()
  soup = BeautifulSoup(page.text, "html.parser")
  return soup.find("div", id = "main-content")

### Class News

In [3]:
class News:
  """Plain record holding the fields scraped from a single article.

  The attributes mirror the constructor arguments one-to-one: ``judul``,
  ``narasi``, ``tanggal``, ``sumber``, ``kategori`` and ``penulis``.
  """

  def __init__(self, judul, narasi, tanggal, sumber, kategori, penulis) :
    """Store every argument unchanged on the attribute of the same name."""
    self.judul, self.narasi, self.tanggal = judul, narasi, tanggal
    self.sumber, self.kategori, self.penulis = sumber, kategori, penulis

  def __str__(self) :
    """Render the record as a brace-wrapped block with one field per line."""
    # Every field line is followed by an empty line, and some lines carry
    # trailing blanks; that spacing is part of the rendered text.
    pieces = [
      "",
      "    {  ",
      f"      Judul : {self.judul},",
      "",
      f"      Narasi : {self.narasi},",
      "",
      f"      Tanggal : {self.tanggal},",
      "",
      f"      Sumber : {self.sumber}, ",
      "",
      f"      Kategori : {self.kategori}, ",
      "",
      f"      Penulis : {self.penulis}",
      "",
      "    }",
      "    ",
    ]
    return "\n".join(pieces)

### Function to replace `<br>` tags with newline characters (`\n`)

In [4]:
def handle_breaks(element) :
  """Swap every ``<br>`` found under ``element`` for a newline string.

  The element is modified in place; nothing is returned.
  """
  line_breaks = element.find_all("br")
  for tag in line_breaks :
    tag.replace_with("\n")

### Function to extract the attributes of a single news article

In [5]:
def get_news_attributes(link, timeout=30) :
  """Download one article page and extract its fields into a ``News`` object.

  Parameters
  ----------
  link : str
      URL of the article page.
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).

  Returns
  -------
  News
      Built from, in order: title, narrative text, date, source, category
      and author. Any field whose markup is missing from the page is
      returned as an empty string instead of raising ``AttributeError``.

  Raises
  ------
  requests.exceptions.HTTPError
      If the server answers with a 4xx/5xx status.
  requests.exceptions.Timeout
      If the server does not answer within ``timeout`` seconds.
  """
  # fetch link
  response = requests.get(link, timeout=timeout)
  response.raise_for_status()
  # Explicit parser, same as get_content, so the parse tree does not depend
  # on which optional parsers happen to be installed.
  news_soup = BeautifulSoup(response.text, "html.parser")

  # judul
  heading = news_soup.find("h1")
  # NOTE(review): drops a fixed 8-character prefix from the heading,
  # presumably a leading label of that length; a title with a longer,
  # shorter or absent label is trimmed incorrectly. Confirm against the site.
  judul = heading.text[8:] if heading is not None else ""

  # narasi
  entry_content = news_soup.find("div", class_="entry-content")
  news_body = entry_content.find_all("p") if entry_content is not None else []
  narasi_label = r"=*\[?narasi\]?\s*[:]?"

  # First paragraph mentioning "narasi"; falls back to the first paragraph.
  narasi_start_idx = 0
  for idx, paragraf in enumerate(news_body) :
    if re.search(narasi_label, paragraf.text, re.I) :
      narasi_start_idx = idx
      break

  # Collect paragraphs from there until one mentions "penjelasan".
  narasi_parts = []
  for idx, paragraf in enumerate(news_body[narasi_start_idx:]) :
    if "penjelasan" in paragraf.text.lower() :
      break
    handle_breaks(paragraf)

    if idx == 0 :
      # count=1: strip only the label itself, not later uses of the word
      # inside the narrative text.
      narasi_parts.append(re.sub(narasi_label, "", paragraf.text, count=1, flags=re.I))
    else :
      narasi_parts.append(paragraf.text)
  narasi = "".join(narasi_parts)

  # tanggal
  date_span = news_soup.find("span", class_="entry-meta-date")
  date_link = date_span.find("a") if date_span is not None else None
  tanggal = date_link.decode_contents() if date_link is not None else ""

  # sumber
  # No early exit: when several paragraphs mention "sumber", the last one wins.
  sumber = ""
  for paragraf in news_body :
    match = re.search(r"=*\[?sumber\]?\s*[:]?", paragraf.text, re.I)
    if match :
      sumber = paragraf.text[match.end():].strip()

  # kategori
  temp_kategori = news_soup.find(lambda tag: tag.name == "p" and "kategori" in tag.text.strip().lower())
  if temp_kategori is not None :
    handle_breaks(temp_kategori)
    # count=1: remove the label only, keep the word if the value repeats it.
    kategori = re.sub(r"=*\[?kategori\]?\s*[:]?", "", temp_kategori.text, count=1, flags=re.I)
  else :
    kategori = ""

  # penulis
  author_span = news_soup.find("span", class_="entry-meta-author")
  author_link = author_span.find("a", class_="fn") if author_span is not None else None
  penulis = author_link.decode_contents() if author_link is not None else ""

  return News(judul, narasi, tanggal, sumber, kategori, penulis)

### Function to make a new dataframe

In [7]:
def create_dataframe() :
  """Return a new, empty DataFrame with one column per news field."""
  return pd.DataFrame(
    columns = ["judul", "narasi", "tanggal", "sumber", "kategori", "penulis"]
  )

### Function to extract each news article's link

In [8]:
def extract_individual_link(element) :
  """Collect the article URLs listed under ``element``.

  For every ``<article>`` the URL is read from the ``href`` of the first
  ``<a>`` inside its ``<figure class="mh-loop-thumb">``.

  Parameters
  ----------
  element
      Parsed container holding the ``<article>`` entries; must provide
      ``find_all``.

  Returns
  -------
  list
      The ``href`` values in document order. Articles that have no such
      figure, no link inside it, or a link without an ``href`` are skipped
      rather than raising ``AttributeError`` or yielding ``None``.
  """
  links = []
  for article in element.find_all("article") :
    thumb = article.find("figure", class_="mh-loop-thumb")
    anchor = thumb.find("a") if thumb is not None else None
    href = anchor.get("href") if anchor is not None else None
    if href :
      links.append(href)
  return links

### Function to insert a news item's attributes into the dataframe

In [9]:
def insert_news(df, news) :
  """Append one row to ``df`` in place, copying the six fields from ``news``.

  The row is stored under the label ``len(df)``; nothing is returned.
  """
  fields = ("judul", "narasi", "tanggal", "sumber", "kategori", "penulis")
  next_label = len(df)
  df.loc[next_label] = {field: getattr(news, field) for field in fields}

### Function to save the dataframe as a CSV file

In [10]:
def download_csv(df, path=r'C:/Users/Acer/Desktop/Skripsi/Code/program/turnbackhoax-2.csv') :
  """Write ``df`` to a CSV file encoded as UTF-8 with a byte-order mark.

  Parameters
  ----------
  df : pandas.DataFrame
      Data to save; the index is written as the first column.
  path : str or path-like, optional
      Destination file. Defaults to the location this notebook originally
      hard-coded, so existing ``download_csv(df)`` calls behave as before;
      pass another path to run on a different machine.
  """
  df.to_csv(path, encoding="utf-8-sig")

# Main

In [11]:
# Scrape listing pages 12 through 16 (the upper bound of range is exclusive)
# and gather every linked article into one dataframe.
df = create_dataframe()
for page in range(12, 17) :
  # Main-content container of one paginated listing page.
  page_content = get_content(f"https://turnbackhoax.id/page/{page}")
  news_links = extract_individual_link(page_content)
  for news_link in news_links :
    # Each article is fetched separately; its fields become one new row of df.
    attr = get_news_attributes(news_link)
    insert_news(df, attr)

In [None]:
# Show the scraped dataframe as this cell's output.
display(df)

In [13]:
# Persist the scraped dataframe to disk as a CSV file.
download_csv(df)