In [None]:
## Observasi HTML dari URL yang ingin di-scraping

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

# Initialize the Chrome driver: the value returned by
# ChromeDriverManager().install() is handed to Selenium's Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [2]:
# Open page 1 of the search results for the query "prolanis"
url = "https://www.cnnindonesia.com/search/?query=prolanis&page=1"
driver.get(url)

In [3]:
# Wait a fixed 5 seconds so JavaScript-rendered content has time to load
time.sleep(5)

In [4]:
# Grab the rendered HTML of the page
page_source = driver.page_source

# This exploratory browser session is not needed once the HTML is captured:
# the scraping cell further down starts its own driver, and only that one is
# quit there. Close this one now so its Chrome process is not leaked.
driver.quit()

In [5]:
# Parse the captured HTML with BeautifulSoup, using the 'html.parser' backend
soup = BeautifulSoup(page_source, 'html.parser')

In [6]:
# Display the parsed document to inspect the page markup
soup

<html class="scroll-smooth scroll-pt-[88px]" id="anchor" lang="id-ID"><head>
<title>Hasil Pencarian - Berita Harian Prolanis - CNN Indonesia</title>
<link href="https://cdn.cnnindonesia.com" rel="dns-prefetch"/>
<link href="https://cdn.detik.net.id" rel="dns-prefetch"/>
<link href="https://securepubads.g.doubleclick.net" rel="dns-prefetch"/>
<link href="https://cdnstatic.detik.com" rel="dns-prefetch"/>
<link href="https://akcdn.detik.net.id" rel="dns-prefetch"/>
<link href="https://www.gstatic.com" rel="dns-prefetch"/>
<link href="https://www.google-analytics.com" rel="dns-prefetch"/>
<link href="https://partner.googleadservices.com" rel="dns-prefetch"/>
<link href="https://connect.detik.com" rel="dns-prefetch"/>
<link href="https://www.googletagmanager.com" rel="dns-prefetch"/>
<link href="https://pubads.g.doubleclick.net" rel="dns-prefetch"/>
<link href="https://analytic.detik.com" rel="dns-prefetch"/>
<link href="https://newcomment.detik.com" rel="dns-prefetch"/>
<link href="https:/

In [7]:
# Membuat dan mengeksekusi fungsi scraping

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

# Initialize the Chrome driver used by scrape_page below: the value returned
# by ChromeDriverManager().install() is handed to Selenium's Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Scrape a single search-results page
def scrape_page(url):
    """Load one search-results page and extract its article cards.

    Opens ``url`` with the module-level Selenium ``driver``, waits a fixed
    5 seconds, then parses the rendered HTML with BeautifulSoup.

    Args:
        url: Address of the search-results page to load.

    Returns:
        list[dict]: One dict per ``<article>`` element carrying the
        ``flex-grow`` class, with the keys ``'Judul'``, ``'Link'``,
        ``'Kategori'`` and ``'Tanggal Upload'``. Any field whose element
        (or ``href`` attribute) is missing is returned as ``""``.
    """
    driver.get(url)
    time.sleep(5)  # Give JavaScript-rendered content time to load
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    scraped_data = []

    articles = soup.find_all('article', class_='flex-grow')
    for article in articles:
        link_element = article.find('a', class_='flex')
        # .get() instead of ['href']: an anchor without an href attribute
        # must yield "" like every other missing field, not raise KeyError.
        link = link_element.get('href', '') if link_element else ""

        title_element = article.find('h2', class_='text-cnn_black_light')
        title = title_element.text.strip() if title_element else ""

        category_element = article.find('span', class_='text-xs text-cnn_red')
        category = category_element.text.strip() if category_element else ""

        date_element = article.find('span', class_='text-xs text-cnn_black_light3')
        # Remove the bullet separator that appears in the date text
        date = date_element.text.strip().replace('•', '').strip() if date_element else ""

        scraped_data.append({
            'Judul': title,
            'Link': link,
            'Kategori': category,
            'Tanggal Upload': date
        })

    return scraped_data

# Collect the rows of every page in a plain list and build the DataFrame once
# at the end; concatenating onto an empty DataFrame inside the loop is
# quadratic and triggers a pandas FutureWarning about empty entries.
all_rows = []

try:
    # Loop over result pages 1 and 2 (the upper bound of range() is exclusive)
    for page in range(1, 3):
        url = f"https://www.cnnindonesia.com/search/?query=prolanis&page={page}"
        print(f"Scraping halaman {page}...")
        page_data = scrape_page(url)

        # Add this page's rows to the overall result
        all_rows.extend(page_data)

        # Pause briefly before the next page to reduce the risk of rate limiting
        time.sleep(2)
finally:
    # Always close the browser, even if a page fails to load or parse
    driver.quit()

# Passing the columns explicitly keeps the layout stable even with zero rows
all_data = pd.DataFrame(all_rows, columns=['Judul', 'Link', 'Kategori', 'Tanggal Upload'])

# Show the result
print(all_data)



Scraping halaman 1...
Scraping halaman 2...
                                                Judul  \
0                                                       
1                                                       
2                                                       
3                                                       
4                                                       
5                                                       
6                                                       
7                                                       
8                                                       
9                                                       
10                                                      
11                                                      
12                                                      
13                                                      
14                                                      
15                                          

In [9]:
# Save the scraped rows to a CSV file, without the index column
all_data.to_csv('scraping_cnn_prolanis.csv', index=False)

In [10]:
# Membersihkan data

In [11]:
import pandas as pd
import numpy as np

# Read the CSV file written by the scraping step
df = pd.read_csv(r'scraping_cnn_prolanis.csv', encoding='utf-8')

# Normalize one cell value: blank-like strings become NaN
def clean_data(x):
    """Return ``x`` with placeholder strings replaced by NaN.

    Strings are stripped of surrounding whitespace; if the stripped text is
    empty or a lone ``'#'`` the result is ``np.nan``, otherwise the stripped
    text. Non-string values are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    stripped = x.strip()
    if stripped in ('', '#'):
        return np.nan
    return stripped

# Apply clean_data to every cell. Mapping column by column gives the same
# element-wise result as DataFrame.applymap, which is deprecated since
# pandas 2.1, and works on older pandas versions too.
df = df.apply(lambda column: column.map(clean_data))

# Drop rows where every column is NaN
df = df.dropna(how='all')

# Drop rows that are missing a value in any of the required columns
df = df.dropna(subset=['Judul', 'Link', 'Kategori', 'Tanggal Upload'])

# Drop duplicates based on the 'Judul' and 'Link' columns
df = df.drop_duplicates(subset=['Judul', 'Link'], keep='first')

# Reset the index last, after every row-removing step, so the saved and
# printed frame has a contiguous 0..n-1 index even when duplicates were dropped
df = df.reset_index(drop=True)

# Save the result to a new CSV file
df.to_csv('scraping_cnn_prolanis_clean2.csv', index=False, encoding='utf-8')

# Show the result
print(df)
print(f"Jumlah baris setelah pembersihan: {len(df)}")

                                                Judul  \
0   BPJS Kesehatan Dorong Perkuat Promotif-Prevent...   
1   Bos BPJS Kesehatan Ajak Semua Pihak Gotong Roy...   
2   Pemprov Jateng Gaet BPJS Kesehatan Dorong Vaks...   
3   Vaksinasi Peserta JKN-KIS Berpenyakit Kronis B...   
4   BPJS Kesehatan Ungkap Pengelolaan Data Penting...   
5   BPJS Kesehatan Prioritaskan Kendali Mutu dan B...   
6   BPJS Kesehatan Optimal Perbaiki Kualitas Layan...   
7   BPJS Kesehatan Perkuat Langkah Promotif Preven...   
8   Mobile JKN Sediakan Fitur Konsultasi Personal ...   
9   Aplikasi Mobile JKN Permudah Dokter Pantau Pasien   
10    IDI Ajak Masyarakat Gunakan Aplikasi Mobile JKN   
11  Diadukan ke PBB, BPJS Kesehatan Koreksi Data K...   

                                                 Link    Kategori  \
0   https://www.cnnindonesia.com/ekonomi/202111151...     Ekonomi   
1   https://www.cnnindonesia.com/ekonomi/202109251...     Ekonomi   
2   https://www.cnnindonesia.com/ekonomi/202108271.