<a href="https://colab.research.google.com/github/Noertri/scraper-nusabali/blob/main/scraper_nusabali.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Menginstall package

1. Unggah file `requirements.txt`.
2. Jalankan sel di bawah ini.

In [None]:
# Install (or upgrade) every package listed in the uploaded requirements.txt.
!pip install --upgrade -r requirements.txt

# Program

1. Import package yang diperlukan dan membuat fungsi

In [40]:
import httpx
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
import pandas as pd
from urllib import parse
import dateparser
from datetime import datetime


# Shared HTTP client used by both request functions: follows redirects and
# applies a 5-second timeout.
client = httpx.Client(follow_redirects=True, timeout=5.)
# Site root; the search page URL is joined onto it.
main_url = "https://www.nusabali.com"
# Prefix for article links built from API records: <base>/<article_id>/<slug>.
article_base_url = "https://www.nusabali.com/berita"
# Accumulates one dict per scraped article; turned into a DataFrame when saving.
results = []


@dataclass
class Result:
    """One scraped article row; every field defaults to an empty string."""

    # Publication date/time as display text ("tanggal" = date).
    tanggal: str = ""
    # Article headline ("judul" = title).
    judul: str = ""
    # Link to the article.
    url: str = ""


@dataclass(init=False)
class JsonToResult(Result):
    """A ``Result`` built from the keyword fields of one API article record.

    The dataclass-generated ``__init__`` is disabled so the constructor can
    accept the record's own field names, e.g. ``JsonToResult(**item)``.
    """

    def __init__(self, article_id, slug, published_at, article_title, **kwargs):
        """Map an API record onto the inherited ``tanggal``/``judul``/``url``.

        Args:
            article_id: Article identifier; becomes a URL path segment.
            slug: Article slug; the final URL path segment.
            published_at: Timestamp text in ``YYYY-MM-DD HH:MM:SS`` form.
            article_title: Headline, stored unchanged in ``judul``.
            **kwargs: Any remaining record fields; kept on ``_kwargs``.

        Raises:
            ValueError: If ``published_at`` does not match the expected form.
        """
        published = datetime.strptime(published_at, "%Y-%m-%d %H:%M:%S")
        self.tanggal = published.strftime("%d %B %Y %H:%M:%S")
        self.judul = article_title
        self.url = f"{article_base_url}/{article_id}/{slug}"
        # Not a dataclass field, so asdict() leaves it out of the output row.
        self._kwargs = kwargs


def request_main_page(keyword):
    """Fetch the HTML search-results page for ``keyword``.

    Sends a browser-like GET to the site's ``search`` path through the shared
    module-level ``client``, using the search URL itself as the referer.

    Args:
        keyword: Search term; sent as the ``keyword`` query parameter.

    Returns:
        The response when its status code is below 400, otherwise ``None``.
    """
    search_params = {'keyword': f'{keyword}'}

    # The referer is the same search URL the request is about to hit.
    referer_url = parse.urlunsplit(
        ("https", "www.nusabali.com", "search", parse.urlencode(search_params), "")
    )

    # Header set copied from a desktop Chrome page navigation.
    browser_headers = {
        'authority': 'www.nusabali.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7,ms;q=0.6,ja;q=0.5',
        'dnt': '1',
        'referer': referer_url,
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }

    response = client.get(
        parse.urljoin(main_url, "search"),
        params=search_params,
        headers=browser_headers,
    )

    # Error statuses (>= 400) are reported to the caller as None.
    if response.status_code >= 400:
        return None
    return response


def request_api(keyword, page):
    """Fetch one page of search results from the ``/api/berita/list`` endpoint.

    The request imitates the XHR a browser sends from the search page: the
    referer is the search URL for ``keyword`` and ``x-requested-with`` is set.

    Args:
        keyword: Search term; sent as the ``keyword`` query parameter.
        page: Page number; sent as the ``page`` query parameter.

    Returns:
        The response when its status code is below 400 (a success line is
        printed first), otherwise ``None``.
    """
    keyword_text = f'{keyword}'

    # Referer: the search page URL for this keyword.
    referer_query = parse.urlencode({'keyword': keyword_text})
    referer_url = parse.urlunsplit(
        ("https", "www.nusabali.com", "search", referer_query, "")
    )

    # Header set copied from a desktop Chrome XHR request.
    xhr_headers = {
        'authority': 'www.nusabali.com',
        'accept': '*/*',
        'accept-language': 'id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7,ms;q=0.6,ja;q=0.5',
        'dnt': '1',
        'referer': referer_url,
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }

    api_params = {
        'search': 'true',
        'page': f'{page}',
        'keyword': keyword_text,
    }

    response = client.get(
        'https://www.nusabali.com/api/berita/list',
        params=api_params,
        headers=xhr_headers,
    )

    # Error statuses (>= 400) are reported to the caller as None.
    if response.status_code >= 400:
        return None

    print(f"Berhasil!!! Keyword: {keyword} Halaman: {page}")
    return response


2. Topik/kata kunci artikel

In [42]:
# Search keyword (article topic) passed to the request functions and echoed in
# the progress messages.
keyword = "kependudukan"

3. Scrape halaman awal

In [43]:
# Scrape the first page of search results, which the site serves as HTML.
print("Mengambil artikel di halaman awal...")

r = request_main_page(keyword)

# request_main_page returns None for an HTTP status >= 400, so report success
# and parse only when a response actually came back.
if r is None:
    print(f"Gagal mengambil halaman awal. Keyword: {keyword}")
    article_list = []
else:
    print(f"Berhasil!!! Keyword: {keyword}")
    soup = BeautifulSoup(r.text, "html.parser")
    article_list = soup.select("div#article-list>div.row")

for article in article_list:
    h5_tag = article.select_one("div.col-xs-8.col-md-8 h5")
    span_tag = article.select_one("div.col-xs-12.entry-metas>span>span")
    # Look for the link only when the headline tag exists; a row without one
    # yields an empty title and URL instead of raising AttributeError.
    a_tag = h5_tag.select_one("a") if h5_tag is not None else None

    article_title = h5_tag.get_text(strip=True, separator=" ") if h5_tag is not None else ""
    article_url = a_tag.get("href", "") if a_tag is not None else ""
    date_text = span_tag.get_text(strip=True, separator=" ") if span_tag is not None else ""

    try:
        article_date = datetime.strptime(date_text, "%d %b %Y %H:%M")
    except ValueError:
        # strptime only understands month abbreviations of the current locale
        # and rejects empty text; let dateparser try Indonesian and English
        # before giving up.
        article_date = dateparser.parse(date_text, languages=["id", "en"]) if date_text else None

    # Keep the raw date text when no parser could interpret it, so one odd
    # row does not abort the whole scrape.
    if article_date is not None:
        tanggal = article_date.strftime("%d %B %Y %H:%M:%S")
    else:
        tanggal = date_text

    result = Result(tanggal=tanggal, judul=article_title, url=article_url)
    results.append(asdict(result))

Mengambil artikel di halaman awal...
Berhasil!!! Keyword: kependudukan


4. scrape halaman selanjutnya (opsional)

In [44]:
import time

# num_pages: number of result pages to scrape (1 page = 20 articles)

num_pages = 5

print("Mengambil artikel selanjutnya...")

for i in range(1, num_pages+1):
    # Pause between consecutive requests only; no wait is needed before the
    # first request or after the last one.
    if i > 1:
        time.sleep(3)

    r = request_api(keyword, i)

    # request_api returns None for an HTTP status >= 400; skip that page
    # instead of crashing on r.json().
    if r is None:
        print(f"Gagal!!! Keyword: {keyword} Halaman: {i}")
        continue

    try:
        data = r.json()
    except ValueError as err:
        # A body that is not valid JSON raises a ValueError subclass.
        print(f"Respons bukan JSON. Keyword: {keyword} Halaman: {i} ({err})")
        continue

    if data:
        # `or []` also covers a payload whose "data" value is null.
        for item in data.get("data") or []:
            result2 = JsonToResult(**item)
            results.append(asdict(result2))

Mengambil artikel selanjutnya...
Berhasil!!! Keyword: kependudukan Halaman: 1
Berhasil!!! Keyword: kependudukan Halaman: 2
Berhasil!!! Keyword: kependudukan Halaman: 3
Berhasil!!! Keyword: kependudukan Halaman: 4
Berhasil!!! Keyword: kependudukan Halaman: 5


5. Menyimpan hasil ke file excel

In [45]:
def save_to_excel(prefix):
    """Print the collected results and write them to a timestamped .xlsx file.

    The file name is ``<prefix>_<ddmmYYYYHHMMSS>.xlsx``, using the current
    local time. Rows come from the module-level ``results`` list.

    Args:
        prefix: Leading part of the output file name.

    Returns:
        None. A failure while writing is printed rather than raised.
    """
    timestamp = datetime.now().strftime("%d%m%Y%H%M%S")
    filename = f"{prefix}_{timestamp}.xlsx"

    table = pd.DataFrame(results)
    print(table.to_string())

    print(f"Menyimpan ke file {filename}...")

    try:
        table.to_excel(filename, index=False)
    except Exception as err:
        # Best effort: report the problem and carry on.
        print(err)
    else:
        print("Berhasil!!!")

# Build the file prefix from the keyword chosen in step 2 so the file name
# always matches the scraped topic; runs of whitespace become underscores.
save_to_excel("artikel_" + "_".join(keyword.split()))

                    tanggal                                                                                             judul                                                                                                                                     url
0     28 July 2023 15:26:00                                                           Tiga Terdakwa KTP Aspal Divonis Berbeda                                                          https://www.nusabali.com/berita/147119/tiga-terdakwa-ktp-aspal-divonis-berbeda
1     27 July 2023 09:05:00                                                             Bupati Lantik Dua Pejabat Disdukcapil                                                            https://www.nusabali.com/berita/146980/bupati-lantik-dua-pejabat-disdukcapil
2     26 July 2023 15:15:00                                                          Komisi II DPR RI Dorong Program One Card                                                         https://www.nusabali.com/berita/