In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random

def crawl_kompas_gaza(max_pages=200):
    articles = []
    base_url = "https://www.kompas.com/tag/gaza?page={}"

    for page in tqdm(range(1, max_pages + 1), desc="Crawling Kompas Gaza"):
        url = base_url.format(page)
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(resp.text, 'html.parser')

        items = soup.select("div.articleItem")
        if not items:
            print(f"[!] Tidak ada artikel di halaman {page}, berhenti.")
            break

        for item in items:
            title_tag = item.select_one("h2.articleTitle")
            link_tag = item.select_one("a.article-link")

            if not title_tag or not link_tag:
                continue

            title = title_tag.get_text(strip=True)
            link = link_tag.get("href")

            # skip video
            if "video.kompas.com" in link:
                continue

            # ambil isi artikel
            try:
                art_resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
                art_soup = BeautifulSoup(art_resp.text, 'html.parser')

                body_parts = art_soup.select("div.read__content p")
                body = " ".join(p.get_text(strip=True) for p in body_parts)

                date_tag = art_soup.select_one("div.read__time")
                date = date_tag.get_text(strip=True) if date_tag else "Unknown"

                if len(body) > 150:
                    articles.append({
                        "title": title,
                        "body": body,
                        "date": date
                    })

                # delay acak biar lebih natural
                time.sleep(random.uniform(0.4, 0.9))

            except Exception as e:
                continue

        # istirahat tiap 10 halaman
        if page % 10 == 0:
            time.sleep(random.uniform(2, 5))

    df = pd.DataFrame(articles)
    df.to_csv("kompas_gaza_articles.csv", index=False, encoding="utf-8-sig")
    print(f"\nSelesai! {len(df)} artikel disimpan ke 'kompas_gaza_articles.csv'")

# Jalankan
crawl_kompas_gaza(max_pages=300)

Crawling Kompas Gaza: 100%|██████████| 300/300 [49:51<00:00,  9.97s/it]



Selesai! 2077 artikel disimpan ke 'kompas_gaza_articles.csv'


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random

def crawl_kompas_gaza(max_pages=200):
    articles = []
    base_url = "https://www.kompas.com/tag/gaza?page={}"
    for page in tqdm(range(1, max_pages + 1), desc="Crawling Kompas Gaza"):
        url = base_url.format(page)
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(resp.text, 'html.parser')
        items = soup.select("div.articleItem")
        if not items:
            print(f"[!] Tidak ada artikel di halaman {page}, berhenti.")
            break
        for item in items:
            title_tag = item.select_one("h2.articleTitle")
            link_tag = item.select_one("a.article-link")
            if not title_tag or not link_tag:
                continue
            title = title_tag.get_text(strip=True)
            link = link_tag.get("href")
            # skip video
            if "video.kompas.com" in link:
                continue
            # ambil isi artikel
            try:
                art_resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
                art_soup = BeautifulSoup(art_resp.text, 'html.parser')
                
                # Ekstrak nama penulis dari meta tag (content_author)
                author_tag = art_soup.select_one('meta[name="content_author"]')
                author = author_tag['content'] if author_tag else "Unknown"
                
                # Jika meta tag tidak ada, fallback ke class credit-title-nameEditor
                if author == "Unknown":
                    credit_tags = art_soup.select('div.credit-title-nameEditor')
                    if credit_tags:
                        author = " ".join([tag.get_text(strip=True).strip(',') for tag in credit_tags])  # Gabung jika ada multiple
                
                # Ekstrak link foto utama dari meta og:image
                image_tag = art_soup.select_one('meta[property="og:image"]')
                image_url = image_tag['content'] if image_tag else "Unknown"
                
                body_parts = art_soup.select("div.read__content p")
                body = " ".join(p.get_text(strip=True) for p in body_parts)
                date_tag = art_soup.select_one("div.read__time")
                date = date_tag.get_text(strip=True) if date_tag else "Unknown"
                if len(body) > 150:
                    articles.append({
                        "title": title,
                        "author": author,  # Kolom baru: nama penulis
                        "image_url": image_url,  # Kolom baru: link foto utama
                        "url": link,  # Kolom baru: link berita
                        "body": body,
                        "date": date
                    })
                # delay acak biar lebih natural
                time.sleep(random.uniform(0.4, 0.9))
            except Exception as e:
                continue
        # istirahat tiap 10 halaman
        if page % 10 == 0:
            time.sleep(random.uniform(2, 5))
    df = pd.DataFrame(articles)
    df.to_csv("kompas_gaza_articles.csv", index=False, encoding="utf-8-sig")
    print(f"\nSelesai! {len(df)} artikel disimpan ke 'kompas_gaza_articles.csv'")

# Jalankan
crawl_kompas_gaza(max_pages=300)

Crawling Kompas Gaza:  51%|█████▏    | 154/300 [28:18<26:50, 11.03s/it]  

[!] Tidak ada artikel di halaman 155, berhenti.

Selesai! 778 artikel disimpan ke 'kompas_gaza_articles.csv'





In [11]:
df = pd.read_csv("kompas_gaza_articles.csv")

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1940 entries, 0 to 1939
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1940 non-null   object
 1   author     1797 non-null   object
 2   image_url  1940 non-null   object
 3   url        1940 non-null   object
 4   body       1940 non-null   object
 5   date       1940 non-null   object
dtypes: object(6)
memory usage: 91.1+ KB


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import random
import os

def crawl_kompas_gaza(max_pages=300, start_page=1):
    base_url = "https://www.kompas.com/tag/gaza?page={}"
    
    # Cek jika file CSV sudah ada, load data existing untuk append
    csv_file = "kompas_gaza_articles.csv"
    if os.path.exists(csv_file):
        df_existing = pd.read_csv(csv_file, encoding="utf-8-sig")
        articles = df_existing.to_dict('records')
        print(f"Loaded {len(articles)} existing articles from '{csv_file}'. Appending new data.")
    else:
        articles = []
        print(f"Starting fresh. No existing '{csv_file}' found.")
    
    for page in tqdm(range(start_page, max_pages + 1), desc="Crawling Kompas Gaza"):
        url = base_url.format(page)
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(resp.text, 'html.parser')
        items = soup.select("div.articleItem")
        if not items:
            print(f"[!] Tidak ada artikel di halaman {page}. Lanjut ke halaman berikutnya.")
            continue  # Lanjut ke halaman berikutnya, bukan break
        for item in items:
            title_tag = item.select_one("h2.articleTitle")
            link_tag = item.select_one("a.article-link")
            if not title_tag or not link_tag:
                continue
            title = title_tag.get_text(strip=True)
            link = link_tag.get("href")
            # skip video
            if "video.kompas.com" in link:
                continue
            # ambil isi artikel
            try:
                art_resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
                art_soup = BeautifulSoup(art_resp.text, 'html.parser')
                
                # Ekstrak nama penulis dari meta tag (content_author)
                author_tag = art_soup.select_one('meta[name="content_author"]')
                author = author_tag['content'] if author_tag else "Unknown"
                
                # Jika meta tag tidak ada, fallback ke class credit-title-nameEditor
                if author == "Unknown":
                    credit_tags = art_soup.select('div.credit-title-nameEditor')
                    if credit_tags:
                        author = " ".join([tag.get_text(strip=True).strip(',') for tag in credit_tags])  # Gabung jika ada multiple
                
                # Ekstrak link foto utama dari meta og:image
                image_tag = art_soup.select_one('meta[property="og:image"]')
                image_url = image_tag['content'] if image_tag else "Unknown"
                
                body_parts = art_soup.select("div.read__content p")
                body = " ".join(p.get_text(strip=True) for p in body_parts)
                date_tag = art_soup.select_one("div.read__time")
                date = date_tag.get_text(strip=True) if date_tag else "Unknown"
                if len(body) > 150:
                    articles.append({
                        "title": title,
                        "author": author,  # Kolom baru: nama penulis
                        "image_url": image_url,  # Kolom baru: link foto utama
                        "url": link,  # Kolom baru: link berita
                        "body": body,
                        "date": date
                    })
                # delay acak biar lebih natural
                time.sleep(random.uniform(0.4, 0.9))
            except Exception as e:
                continue
        # istirahat tiap 10 halaman
        if page % 10 == 0:
            time.sleep(random.uniform(2, 5))
    
    # Simpan ke CSV (overwrite atau append sejak load existing)
    df = pd.DataFrame(articles)
    df.to_csv(csv_file, index=False, encoding="utf-8-sig")
    print(f"\nSelesai! {len(df)} artikel disimpan ke '{csv_file}'")

# Jalankan, mulai dari halaman yang diinginkan (misal 156 jika 155 kosong)
crawl_kompas_gaza(max_pages=300, start_page=156)

Loaded 778 existing articles from 'kompas_gaza_articles.csv'. Appending new data.


Crawling Kompas Gaza: 100%|██████████| 145/145 [34:34<00:00, 14.31s/it]


Selesai! 1940 artikel disimpan ke 'kompas_gaza_articles.csv'





In [35]:
df1 = pd.read_csv("kompas_gaza_articles.csv")
df2 = pd.read_csv("sindonews_gaza_full.csv")

In [41]:
df1.head(2)

Unnamed: 0,title,author,image_url,url,body,date
0,TNI AD Siapkan 12.000 Prajurit untuk Pasukan P...,Baharudin Al Farisi,https://asset.kompas.com/crops/in9S3T6WZntpUXz...,https://nasional.kompas.com/read/2025/11/26/09...,"JAKARTA, KOMPAS.com -Tentara Nasional Indonesi...","Kompas.com, 26 November 2025, 09:04 WIB"
1,3 Kapal Rumah Sakit TNI AL Siap Bertolak ke Gaza,Baharudin Al Farisi,https://asset.kompas.com/crops/plaL8r3pP55h4WJ...,https://nasional.kompas.com/read/2025/11/26/08...,"JAKARTA, KOMPAS.com- Kepala Dinas Penerangan A...","Kompas.com, 26 November 2025, 08:47 WIB"


In [43]:
df_merged = pd.concat([df1, df2_renamed], ignore_index=True)


In [47]:
df_merged.to_csv("gaza_news_combined.csv", index=True, encoding="utf-8-sig")