# News Scraping

## Extract

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_cnn(url):
    headers = {
        "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
    )
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    # Judul
    judul_tag = soup.select_one("h1")
    judul = judul_tag.get_text(strip=True) if judul_tag else "Tidak ditemukan"

    # Tanggal (kadang ada di <div class=date>)
    tanggal_tag = soup.select_one("div.text-cnn_grey.text-sm.mb-4")
    tanggal = tanggal_tag.get_text(strip=True) if tanggal_tag else "Tidak ditemukan"

    author_tag = soup.select_one("figcaption")
    author = author_tag.get_text(strip=True) if author_tag else "Tidak ditemukan"

    # Isi artikel (CNN pakai class ini cukup konsisten)
    paragraf = soup.select("p")
    isi = "\n".join(p.get_text(strip=True) for p in paragraf) if paragraf else "Isi tidak ditemukan"

    return {
        "judul": judul,
        "tanggal": tanggal,
        "author" : author,
        "isi": isi
    }

# Coba artikel contoh
urls = [
    "https://www.cnnindonesia.com/nasional/20251119230659-20-1297303/marak-video-amatir-erupsi-semeru-khofifah-ingatkan-warga-jaga-jarak",
    "https://www.cnnindonesia.com/nasional/20251118080935-12-1296568/korban-bullying-tewas-dan-hilangnya-rasa-aman-di-sekolah",
    "https://www.cnnindonesia.com/nasional/20251106101927-32-1292524/gimik-politik-sidang-mkd-di-balik-sanksi-sahroni-nafa-urbach-cs",
    "https://www.cnnindonesia.com/ekonomi/20251021062509-532-1286684/melihat-ekonomi-ri-setahun-prabowo-gibran-adakah-yang-perlu-dibenahi",
    "https://www.cnnindonesia.com/olahraga/20251023055108-142-1287535/apakah-pelatih-timnas-indonesia-harus-tenar-lagi",
    "https://www.cnnindonesia.com/nasional/20251015113125-12-1284720/terapis-spa-14-tahun-tewas-faktor-ekonomi-hingga-perdagangan-anak",
    "https://www.cnnindonesia.com/ekonomi/20251016061332-532-1285042/tepatkah-purbaya-mau-hapus-utang-rp1-juta-biar-warga-bisa-ajukan-kpr",
    "https://www.cnnindonesia.com/nasional/20251009111137-32-1282674/catatan-untuk-komite-otsus-urus-papua-bareng-bkp3-pimpinan-gibran",
    "https://www.cnnindonesia.com/ekonomi/20251002061811-92-1279965/beruntun-cetak-rekor-sampai-kapan-harga-emas-terus-ngegas",
    "https://www.cnnindonesia.com/olahraga/20250924203827-142-1277412/membaca-keputusan-patrick-kluivert",
    "https://www.cnnindonesia.com/nasional/20251119131113-12-1297058/refly-harun-walk-out-usai-roy-suryo-cs-ditolak-reformasi-polri-jimly",
    "https://www.cnnindonesia.com/nasional/20251119170701-20-1297199/erupsi-besar-status-gunung-semeru-meningkat-dari-waspada-ke-siaga",
    "https://www.cnnindonesia.com/nasional/20251119232202-12-1297306/kasus-korupsi-kuota-haji-kpk-sita-1-rumah-dan-sejumlah-kendaraan",
    "https://www.cnnindonesia.com/ekonomi/20251116000013-82-1295923/apa-itu-redenominasi-rupiah-yang-ubah-rp1000-jadi-rp1",
    "https://www.cnnindonesia.com/nasional/20251119140232-32-1297089/alasan-fraksi-pks-copot-mardani-ali-sera-dari-ketua-bksap-dpr",
    "https://www.cnnindonesia.com/nasional/20251119235105-20-1297307/sekitar-300-warga-dievakuasi-antisipasi-letusan-susulan-gunung-semeru",
    "https://www.cnnindonesia.com/ekonomi/20251119190500-92-1297252/bp-bumn-buka-suara-soal-isu-delisting-saham-waskita-imbas-merger",
    "https://www.cnnindonesia.com/nasional/20251119180905-12-1297237/warga-sergai-hilang-2-tahun-ditemukan-tinggal-kerangka-di-pohon-aren",
    "https://www.cnnindonesia.com/nasional/20251119165436-20-1297192/kpu-solo-klarifikasi-berkas-jokowi-baru-setahun-sudah-dimusnahkan",
    "https://www.cnnindonesia.com/ekonomi/20251119174800-85-1297227/bos-pertamina-respons-keluhan-dpr-soal-bbm-di-jatim",
    "https://www.cnnindonesia.com/ekonomi/20251119194621-532-1297275/riset-pdb-ri-lewati-titik-terendah-siap-tumbuh-52-persen-di-2026",
    "https://www.cnnindonesia.com/internasional/20251116033359-109-1295941/rudal-ini-kota-terlarang-bagi-perokok",
    "https://www.cnnindonesia.com/internasional/20251119073920-134-1296918/dk-pbb-setuju-kirim-pasukan-asing-ke-gaza-sampai-trump-bertemu-mbs",
    "https://www.cnnindonesia.com/internasional/20251118195802-120-1296844/israel-di-atas-angin-imbas-resolusi-gaza-dk-pbb-ini-kata-netanyahu",
    "https://www.cnnindonesia.com/internasional/20251118135234-134-1296724/trump-girang-dk-pbb-setuju-kirim-pasukan-internasional-ke-gaza",
    "https://www.cnnindonesia.com/internasional/20250925125524-134-1277599/presiden-indonesia-ini-minta-hak-veto-dk-pbb-ditinjau-ulang",
    "https://www.cnnindonesia.com/olahraga/20251119074004-142-1296919/performa-indonesia-u-23-menjanjikan-emas-sea-games-2025-masih-jauh",
    "https://www.cnnindonesia.com/olahraga/20251119133352-142-1297070/daftar-top-skor-kualifikasi-piala-dunia-2026-haaland-runcing",
    "https://www.cnnindonesia.com/olahraga/20251119191344-142-1297260/8-negara-peserta-piala-dunia-2026-yang-pernah-dihajar-timnas-indonesia",
    "https://www.cnnindonesia.com/olahraga/20251119134608-142-1297076/indra-isyaratkan-pemain-abroad-timnas-u-23-belum-pasti-ke-sea-games"
]


urls = list(set(urls))

results = []

for url in urls:
    print("Scraping:", url)
    try:
        data = scrape_cnn(url)
        data["url"] = url
        results.append(data)
    except Exception as e:
        results.append({
            "url": url,
            "error": str(e)
        })
    time.sleep(1)

df = pd.DataFrame(results)
df.to_csv('cnnindo.csv', index=False, encoding='utf-8-sig')

print("Done")

Scraping: https://www.cnnindonesia.com/nasional/20251119170701-20-1297199/erupsi-besar-status-gunung-semeru-meningkat-dari-waspada-ke-siaga
Scraping: https://www.cnnindonesia.com/internasional/20251116033359-109-1295941/rudal-ini-kota-terlarang-bagi-perokok
Scraping: https://www.cnnindonesia.com/internasional/20251119073920-134-1296918/dk-pbb-setuju-kirim-pasukan-asing-ke-gaza-sampai-trump-bertemu-mbs
Scraping: https://www.cnnindonesia.com/ekonomi/20251116000013-82-1295923/apa-itu-redenominasi-rupiah-yang-ubah-rp1000-jadi-rp1
Scraping: https://www.cnnindonesia.com/nasional/20251119235105-20-1297307/sekitar-300-warga-dievakuasi-antisipasi-letusan-susulan-gunung-semeru
Scraping: https://www.cnnindonesia.com/nasional/20251119140232-32-1297089/alasan-fraksi-pks-copot-mardani-ali-sera-dari-ketua-bksap-dpr
Scraping: https://www.cnnindonesia.com/nasional/20251119131113-12-1297058/refly-harun-walk-out-usai-roy-suryo-cs-ditolak-reformasi-polri-jimly
Scraping: https://www.cnnindonesia.com/olahra