In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random

def _fetch_soup(session, url, timeout):
    """Download *url* and return its parsed HTML.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    resp = session.get(url, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def _extract_article(art_soup):
    """Return ``(body, date)`` pulled from a parsed article page.

    ``body`` is the text of every ``div.read__content p`` joined by single
    spaces; ``date`` is the text of ``div.read__time`` or ``"Unknown"`` when
    that element is absent.
    """
    body_parts = art_soup.select("div.read__content p")
    body = " ".join(p.get_text(strip=True) for p in body_parts)

    date_tag = art_soup.select_one("div.read__time")
    date = date_tag.get_text(strip=True) if date_tag else "Unknown"
    return body, date


def crawl_kompas_gaza(max_pages=200, timeout=15):
    """Crawl the Kompas "gaza" tag listing and save the articles to CSV.

    Walks listing pages ``1..max_pages``, follows every non-video article
    link, extracts title, body and date, and writes the collected rows to
    ``kompas_gaza_articles.csv`` (columns ``title``, ``body``, ``date``).
    Crawling stops early at the first listing page that has no article
    items or cannot be fetched; whatever was collected so far is still
    written out.

    Args:
        max_pages: Highest listing page number to request.
        timeout: Per-request timeout in seconds passed to ``requests``;
            without it a stalled connection would block forever.

    Returns:
        None. The result is the CSV file plus a summary line on stdout.
    """
    articles = []
    seen_links = set()  # an article can show up on more than one listing page
    base_url = "https://www.kompas.com/tag/gaza?page={}"

    # One session reuses the underlying connection across requests.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0'})

        for page in tqdm(range(1, max_pages + 1), desc="Crawling Kompas Gaza"):
            try:
                soup = _fetch_soup(session, base_url.format(page), timeout)
            except requests.RequestException as exc:
                # Stop instead of crashing so the articles gathered so far
                # are still saved below.
                tqdm.write(f"[!] Gagal memuat halaman {page} ({exc}), berhenti.")
                break

            items = soup.select("div.articleItem")
            if not items:
                tqdm.write(f"[!] Tidak ada artikel di halaman {page}, berhenti.")
                break

            for item in items:
                title_tag = item.select_one("h2.articleTitle")
                link_tag = item.select_one("a.article-link")
                if not title_tag or not link_tag:
                    continue

                title = title_tag.get_text(strip=True)
                link = link_tag.get("href")

                # An anchor without href yields None; skip it rather than
                # failing on the substring test below.
                if not link:
                    continue

                # Skip video pages.
                if "video.kompas.com" in link:
                    continue

                if link in seen_links:
                    continue
                seen_links.add(link)

                # Fetch the article body. Deliberately best-effort: any
                # failure on one article is reported and the crawl goes on.
                try:
                    body, date = _extract_article(
                        _fetch_soup(session, link, timeout)
                    )
                except Exception as exc:
                    tqdm.write(f"[!] Gagal mengambil artikel {link}: {exc}")
                else:
                    # Very short bodies are discarded (same 150-character
                    # threshold as before).
                    if len(body) > 150:
                        articles.append({
                            "title": title,
                            "body": body,
                            "date": date
                        })
                finally:
                    # Random delay after every article request, including
                    # failed ones, to keep the request rate low.
                    time.sleep(random.uniform(0.4, 0.9))

            # Longer pause every 10 listing pages.
            if page % 10 == 0:
                time.sleep(random.uniform(2, 5))

    # Explicit columns keep the CSV header present even when nothing was
    # collected.
    df = pd.DataFrame(articles, columns=["title", "body", "date"])
    df.to_csv("kompas_gaza_articles.csv", index=False, encoding="utf-8-sig")
    print(f"\nSelesai! {len(df)} artikel disimpan ke 'kompas_gaza_articles.csv'")

# Run the crawler over up to 300 listing pages; the function itself writes
# the collected articles to 'kompas_gaza_articles.csv'.
crawl_kompas_gaza(max_pages=300)

Crawling Kompas Gaza: 100%|██████████| 300/300 [49:51<00:00,  9.97s/it]



Selesai! 2077 artikel disimpan ke 'kompas_gaza_articles.csv'
