# News Scraping

## Extract

In [59]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_cnn(url, timeout=10):
    """Fetch one article page and extract its title, date, author and body.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``"judul"`` (title), ``"tanggal"`` (date),
        ``"author"`` and ``"isi"`` (body text). A field whose element is not
        present in the page holds a "not found" placeholder string instead.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
        )
    }
    # Without a timeout a stalled connection would block the run forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were
    # articles and returning a row full of placeholders.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Title: first <h1> in the document.
    judul_tag = soup.select_one("h1")
    judul = judul_tag.get_text(strip=True) if judul_tag else "Tidak ditemukan"

    # Date: first <div> carrying all of these utility classes.
    tanggal_tag = soup.select_one("div.text-cnn_grey.text-sm.mb-4")
    tanggal = tanggal_tag.get_text(strip=True) if tanggal_tag else "Tidak ditemukan"

    # Author: text of the first <figcaption>.
    # NOTE(review): <figcaption> is normally an image caption - confirm that
    # it really carries the author on these pages.
    author_tag = soup.select_one("figcaption")
    author = author_tag.get_text(strip=True) if author_tag else "Tidak ditemukan"

    # Body: every <p> in the page joined with newlines. The selector is not
    # scoped to an article container, so non-article paragraphs may be included.
    paragraf = soup.select("p")
    isi = "\n".join(p.get_text(strip=True) for p in paragraf) if paragraf else "Isi tidak ditemukan"

    return {
        "judul": judul,
        "tanggal": tanggal,
        "author": author,
        "isi": isi,
    }

# Sample article URLs to scrape
urls = [
    "https://www.cnnindonesia.com/nasional/20251119131113-12-1297058/refly-harun-walk-out-usai-roy-suryo-cs-ditolak-reformasi-polri-jimly",
    "https://www.cnnindonesia.com/nasional/20251119170701-20-1297199/erupsi-besar-status-gunung-semeru-meningkat-dari-waspada-ke-siaga",
    "https://www.cnnindonesia.com/nasional/20251119232202-12-1297306/kasus-korupsi-kuota-haji-kpk-sita-1-rumah-dan-sejumlah-kendaraan",
    "https://www.cnnindonesia.com/ekonomi/20251116000013-82-1295923/apa-itu-redenominasi-rupiah-yang-ubah-rp1000-jadi-rp1",
    "https://www.cnnindonesia.com/nasional/20251119140232-32-1297089/alasan-fraksi-pks-copot-mardani-ali-sera-dari-ketua-bksap-dpr",
    "https://www.cnnindonesia.com/nasional/20251119235105-20-1297307/sekitar-300-warga-dievakuasi-antisipasi-letusan-susulan-gunung-semeru",
    "https://www.cnnindonesia.com/ekonomi/20251119190500-92-1297252/bp-bumn-buka-suara-soal-isu-delisting-saham-waskita-imbas-merger",
    "https://www.cnnindonesia.com/nasional/20251119180905-12-1297237/warga-sergai-hilang-2-tahun-ditemukan-tinggal-kerangka-di-pohon-aren",
    "https://www.cnnindonesia.com/nasional/20251119165436-20-1297192/kpu-solo-klarifikasi-berkas-jokowi-baru-setahun-sudah-dimusnahkan",
    "https://www.cnnindonesia.com/ekonomi/20251119174800-85-1297227/bos-pertamina-respons-keluhan-dpr-soal-bbm-di-jatim",
    "https://www.cnnindonesia.com/ekonomi/20251119194621-532-1297275/riset-pdb-ri-lewati-titik-terendah-siap-tumbuh-52-persen-di-2026",
]

# One row per URL. Every row carries its source URL, so scraped rows can be
# traced back to their page and told apart from failed ones in the CSV.
results = []

for url in urls:
    print("Scraping:", url)
    try:
        data = scrape_cnn(url)
    except Exception as e:
        # Per-URL boundary: report and record the failure, then carry on so
        # one bad page does not abort the whole run.
        print("Failed:", url, "-", e)
        results.append({
            "url": url,
            "error": str(e)
        })
    else:
        results.append({"url": url, **data})
    time.sleep(1)  # pause between requests to go easy on the server


# Rows lacking a column (e.g. "error" on successful rows) are left empty by
# pandas. The utf-8-sig encoding writes a BOM at the start of the file.
df = pd.DataFrame(results)
df.to_csv('cnnindo.csv', index=False, encoding='utf-8-sig')

print("Done")

Scraping: https://www.cnnindonesia.com/nasional/20251119131113-12-1297058/refly-harun-walk-out-usai-roy-suryo-cs-ditolak-reformasi-polri-jimly
Scraping: https://www.cnnindonesia.com/nasional/20251119170701-20-1297199/erupsi-besar-status-gunung-semeru-meningkat-dari-waspada-ke-siaga
Scraping: https://www.cnnindonesia.com/nasional/20251119232202-12-1297306/kasus-korupsi-kuota-haji-kpk-sita-1-rumah-dan-sejumlah-kendaraan
Scraping: https://www.cnnindonesia.com/ekonomi/20251116000013-82-1295923/apa-itu-redenominasi-rupiah-yang-ubah-rp1000-jadi-rp1
Scraping: https://www.cnnindonesia.com/nasional/20251119140232-32-1297089/alasan-fraksi-pks-copot-mardani-ali-sera-dari-ketua-bksap-dpr
Scraping: https://www.cnnindonesia.com/nasional/20251119235105-20-1297307/sekitar-300-warga-dievakuasi-antisipasi-letusan-susulan-gunung-semeru
Scraping: https://www.cnnindonesia.com/ekonomi/20251119190500-92-1297252/bp-bumn-buka-suara-soal-isu-delisting-saham-waskita-imbas-merger
Scraping: https://www.cnnindonesi

OSError: Cannot save file into a non-existent directory: 'data'