In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
import csv
import matplotlib.pyplot as plt
import torch
from transformers import pipeline
from collections import Counter
from wordcloud import WordCloud
import numpy as np

# Check CUDA availability: the sentiment pipeline below is pinned to GPU 0,
# so the script refuses to start without a visible CUDA device.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA tidak tersedia. Pastikan GPU dan driver terdeteksi.")
device = torch.device("cuda")
print(f"CUDA tersedia. Menggunakan perangkat: {torch.cuda.get_device_name(0)}")

# Initialise the NLP model on CUDA.
# The bare "distilbert-base-multilingual-cased" checkpoint ships without a
# trained classification head (transformers warns that the classifier weights
# are "newly initialized"), so its sentiment predictions are random.  Use a
# checkpoint that was actually fine-tuned for multilingual sentiment instead.
# NOTE(review): the label strings returned by the pipeline are defined by the
# checkpoint's config (per its model card: positive / neutral / negative) --
# whatever consumes the results must compare against those strings.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    device=0,
)

# Path to chromedriver.  The browser is started only after the model loaded
# successfully, so a failed model download does not leave a stray Chrome window.
chrome_driver_path = r"C:\laragon\www\sosialmedia-monitoring\chromedriver-win64\chromedriver.exe"
service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service)

# Extended list of news sources.
# Maps a news-site name to a {category label: listing-page URL} dict.  The
# site name is also the key used to look up that site's headline selectors in
# scrape_headlines, and the category label is what ends up in the CSV and in
# the per-category chart.  Not every site defines every category.
news_sources = {
    "Detik": {
        "Terpopuler": "https://www.detik.com/terpopuler",
        "Otomotif": "https://oto.detik.com/",
        "Politik": "https://news.detik.com/politik",
        "Ekonomi": "https://finance.detik.com/",
        "Olahraga": "https://sport.detik.com/",
        "Teknologi": "https://inet.detik.com/",
        "Hiburan": "https://20.detik.com/"
    },
    "Kompas": {
        "Tren": "https://www.kompas.com/tren",
        "Otomotif": "https://otomotif.kompas.com/",
        "Politik": "https://nasional.kompas.com/politik",
        "Ekonomi": "https://money.kompas.com/",
        "Olahraga": "https://bola.kompas.com/",
        "Teknologi": "https://tekno.kompas.com/",
        "Hiburan": "https://www.kompas.com/hype"
    },
    "CNN Indonesia": {
        "Nasional": "https://www.cnnindonesia.com/nasional",
        "Otomotif": "https://www.cnnindonesia.com/otomotif",
        "Politik": "https://www.cnnindonesia.com/politik",
        "Ekonomi": "https://www.cnnindonesia.com/ekonomi",
        "Olahraga": "https://www.cnnindonesia.com/olahraga",
        "Teknologi": "https://www.cnnindonesia.com/teknologi",
        "Hiburan": "https://www.cnnindonesia.com/hiburan"
    },
    "Tempo": {
        "Terpopuler": "https://www.tempo.co/terpopuler",
        "Otomotif": "https://otomotif.tempo.co/",
        "Politik": "https://nasional.tempo.co/politik",
        "Ekonomi": "https://bisnis.tempo.co/",
        "Olahraga": "https://sport.tempo.co/",
        "Teknologi": "https://tekno.tempo.co/",
        # NOTE(review): "Hiburan" points at the cantik.* subdomain -- confirm this is the intended section.
        "Hiburan": "https://cantik.tempo.co/"
    },
    "Tribunnews": {
        "Populer": "https://www.tribunnews.com/populer",
        "Otomotif": "https://www.tribunnews.com/otomotif",
        "Politik": "https://www.tribunnews.com/politik",
        "Ekonomi": "https://www.tribunnews.com/bisnis",
        "Olahraga": "https://www.tribunnews.com/sport",
        "Teknologi": "https://www.tribunnews.com/techno",
        "Hiburan": "https://www.tribunnews.com/seleb"
    },
    "Liputan6": {
        "News": "https://www.liputan6.com/news",
        "Otomotif": "https://www.liputan6.com/otomotif",
        "Politik": "https://www.liputan6.com/news/politik",
        "Ekonomi": "https://www.liputan6.com/bisnis",
        "Olahraga": "https://www.liputan6.com/bola",
        "Teknologi": "https://www.liputan6.com/tekno",
        "Hiburan": "https://www.liputan6.com/showbiz"
    },
    "Republika": {
        "Terpopuler": "https://www.republika.co.id/terpopuler",
        "Otomotif": "https://otomotif.republika.co.id/",
        "Politik": "https://nasional.republika.co.id/politik",
        "Ekonomi": "https://ekonomi.republika.co.id/",
        "Olahraga": "https://olahraga.republika.co.id/",
        "Teknologi": "https://tekno.republika.co.id/",
        "Islam": "https://islam.republika.co.id/"
    },
    "Okezone": {
        "Beranda": "https://www.okezone.com/",
        "Otomotif": "https://otomotif.okezone.com/",
        "Politik": "https://nasional.okezone.com/politik",
        "Ekonomi": "https://economy.okezone.com/",
        "Olahraga": "https://sports.okezone.com/",
        "Teknologi": "https://techno.okezone.com/",
        "Hiburan": "https://celebrity.okezone.com/"
    },
    "Suara": {
        "Terpopuler": "https://www.suara.com/terpopuler",
        "Otomotif": "https://www.suara.com/otomotif",
        "Politik": "https://www.suara.com/politik",
        "Ekonomi": "https://www.suara.com/bisnis",
        "Olahraga": "https://www.suara.com/sport",
        "Teknologi": "https://www.suara.com/tekno",
        "Hiburan": "https://www.suara.com/entertainment"
    },
    "Viva": {
        "Berita": "https://www.viva.co.id/berita",
        "Otomotif": "https://www.viva.co.id/otomotif",
        "Politik": "https://www.viva.co.id/berita/politik",
        "Ekonomi": "https://www.viva.co.id/berita/bisnis",
        "Olahraga": "https://www.viva.co.id/sport",
        "Teknologi": "https://www.viva.co.id/digital",
        "Hiburan": "https://www.viva.co.id/showbiz"
    },
    "Sindonews": {
        "Nasional": "https://nasional.sindonews.com/",
        "Otomotif": "https://otomotif.sindonews.com/",
        "Ekonomi": "https://ekonomi.sindonews.com/",
        "Olahraga": "https://sports.sindonews.com/",
        "Teknologi": "https://tekno.sindonews.com/",
        "Hiburan": "https://lifestyle.sindonews.com/"
    },
    "Antara News": {
        "Nasional": "https://www.antaranews.com/nasional",
        "Ekonomi": "https://www.antaranews.com/ekonomi",
        "Olahraga": "https://www.antaranews.com/olahraga",
        "Teknologi": "https://www.antaranews.com/teknologi",
        "Hiburan": "https://www.antaranews.com/hiburan"
    },
    "Bisnis.com": {
        "Ekonomi": "https://ekonomi.bisnis.com/",
        "Otomotif": "https://otomotif.bisnis.com/",
        "Teknologi": "https://teknologi.bisnis.com/",
        "Hiburan": "https://lifestyle.bisnis.com/"
    },
    "Jawa Pos": {
        "Nasional": "https://www.jawapos.com/nasional",
        "Ekonomi": "https://www.jawapos.com/ekonomi",
        "Olahraga": "https://www.jawapos.com/olahraga",
        "Teknologi": "https://www.jawapos.com/tekno",
        "Hiburan": "https://www.jawapos.com/entertainment"
    },
    "BBC Indonesia": {
        "Berita": "https://www.bbc.com/indonesia",
        "Internasional": "https://www.bbc.com/indonesia/internasional",
        # NOTE(review): labelled "Teknologi" but the URL is the /majalah section -- confirm.
        "Teknologi": "https://www.bbc.com/indonesia/majalah"
    }
}

# Helper that scrolls the currently loaded page towards its bottom
def scroll_to_bottom(max_scrolls=5):
    """Press END on the page body until the document stops growing.

    Uses the module-level ``driver``.  After each key press the function
    sleeps for a random 3-7 seconds, then re-reads
    ``document.body.scrollHeight``; it stops as soon as the height is
    unchanged or after ``max_scrolls`` presses, whichever comes first.

    Args:
        max_scrolls: Upper bound on the number of END key presses.
    """
    height_script = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_script)

    for _attempt in range(max_scrolls):
        page_body = driver.find_element(By.TAG_NAME, "body")
        page_body.send_keys(Keys.END)

        # Randomised pause between scrolls before measuring the page again.
        pause_seconds = random.uniform(3, 7)
        time.sleep(pause_seconds)

        current_height = driver.execute_script(height_script)
        if current_height != previous_height:
            previous_height = current_height
        else:
            # Height did not change: nothing more was appended to the page.
            break

# Candidate headline selectors per news source, tried in order.  Each entry is
# first interpreted as a CSS class name and, if that yields no text, as a tag
# name.  Built once at import time instead of on every scrape_headlines call.
_HEADLINE_SELECTORS = {
    "Detik": ["media__title", "title", "h2", "article-title", "list-content__title", "media__link"],
    "Kompas": ["article__list__title", "title", "h3", "article-title", "headline__title", "kcm__title"],
    "CNN Indonesia": ["h2", "title", "article-title", "list__title", "nhd__title", "headline"],
    "Tempo": ["title", "h2", "h3", "article__title", "post-title", "judul"],
    "Tribunnews": ["txt-oev-2", "title", "h2", "article-title", "fbo2", "txt-oev-3"],
    "Liputan6": ["articles--iridescent-list--text-item__title", "title", "h2", "article-title", "headline", "list__title"],
    "Republika": ["h3", "title", "h2", "article-title", "headline", "post__title"],
    "Okezone": ["title", "h2", "h3", "article-title", "list-berita__title", "judul"],
    "Suara": ["inject-title", "title", "h2", "article-title", "headline", "post-title"],
    "Viva": ["title", "h2", "h3", "article-title", "headline", "news-title"],
    "Sindonews": ["title", "h2", "article-title", "headline"],
    "Antara News": ["title", "h2", "article-title", "post-title"],
    "Bisnis.com": ["title", "h2", "article-title", "headline"],
    "Jawa Pos": ["title", "h2", "article-title", "post-title"],
    "BBC Indonesia": ["title", "h2", "article-title", "headline"],
}

# Selectors used for a source that has no entry in _HEADLINE_SELECTORS.
_DEFAULT_HEADLINE_SELECTORS = ["title", "h2", "h3", "article-title"]


def _visible_texts(elements):
    """Return the stripped, non-empty ``.text`` of each element.

    ``.text`` is read exactly once per element because every read is a
    round trip to the browser.
    """
    texts = []
    for element in elements:
        text = element.text.strip()
        if text:
            texts.append(text)
    return texts


# Fetch the headline texts of one listing page
def scrape_headlines(url, source_name, category):
    """Load ``url`` in the shared browser and return the headline texts found.

    The page is opened with the module-level ``driver``, given a fixed
    3 second pause, and scrolled with ``scroll_to_bottom``.  The selectors
    configured for ``source_name`` are then tried in order; for each one the
    class-name lookup is tried first and the tag-name lookup is used whenever
    the class-name lookup produced no non-empty text (including the case where
    it matched elements that are all empty).  The first selector that yields
    any text wins.

    NOTE(review): "first selector with any text wins" can return very few
    headlines when an early selector matches only a handful of elements;
    consider picking the selector with the most matches if that matters.

    Args:
        url: Listing page to open.
        source_name: Key into the selector table; unknown names use the
            default selectors.
        category: Only used in progress/error messages.

    Returns:
        A list of stripped, non-empty headline strings (possibly empty).
    """
    driver.get(url)
    time.sleep(3)
    print(f"Memuat {source_name} - {category}...")
    scroll_to_bottom()

    headline_list = []
    for selector in _HEADLINE_SELECTORS.get(source_name, _DEFAULT_HEADLINE_SELECTORS):
        try:
            found = _visible_texts(driver.find_elements(By.CLASS_NAME, selector))
            if not found:
                found = _visible_texts(driver.find_elements(By.TAG_NAME, selector))
        except Exception as e:
            # Best effort: a failing selector must not abort the whole page.
            print(f"Gagal menggunakan selector '{selector}' untuk {source_name} - {category}: {e}")
            continue
        if found:
            headline_list = found
            break

    print(f"Ditemukan {len(headline_list)} judul dari {source_name} - {category}")
    return headline_list

# Sentiment analysis and assorted plots for the headlines matching a keyword
def _map_sentiment(label, score, threshold=0.6):
    """Translate a classifier (label, score) pair into the report bucket.

    The label is compared case-insensitively so checkpoints that emit
    "positive"/"negative" are handled like ones that emit
    "POSITIVE"/"NEGATIVE".  Any other label, or a score that does not exceed
    ``threshold``, is reported as "Netral".
    """
    normalized = str(label).upper()
    if score > threshold:
        if normalized == "POSITIVE":
            return "Positif"
        if normalized == "NEGATIVE":
            return "Negatif"
    return "Netral"


def _filename_safe(text):
    """Replace characters that are invalid in file names (notably on Windows) with "_"."""
    forbidden = '<>:"/\\|?*'
    return "".join("_" if ch in forbidden or ord(ch) < 32 else ch for ch in text)


def _apply_seaborn_style():
    """Activate the seaborn look under whichever name this Matplotlib provides.

    The plain "seaborn" style name was removed in newer Matplotlib releases in
    favour of "seaborn-v0_8"; if neither exists the current style is kept.
    """
    for style_name in ("seaborn-v0_8", "seaborn"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            return


def _annotate_bars(bars):
    """Write each bar's integer height just above the bar."""
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height + 1, int(height), ha='center', va='bottom')


def analyze_and_plot_data(all_headlines, keyword):
    """Report, save and plot the sentiment of headlines containing ``keyword``.

    Headlines are matched with a case-insensitive substring test.  Matching
    headlines are classified with the module-level ``sentiment_analyzer``,
    a text summary is printed, the matches are written to
    ``data_<keyword>_news_nlp_cuda.csv`` and four figures are shown
    (sentiment bar chart, per-source pie chart, per-category bar chart,
    word cloud).

    Args:
        all_headlines: ``{source: {category: [headline, ...]}}``.
        keyword: Search term; lower-cased before matching.

    Returns:
        None.  Returns early, after printing a message, when no headline
        contains the keyword.
    """
    keyword = keyword.lower()

    # Collect every (source, category, headline) triple that mentions the keyword.
    relevant_headlines = [
        (source, category, headline)
        for source, categories in all_headlines.items()
        for category, headlines in categories.items()
        for headline in headlines
        if keyword in headline.lower()
    ]

    if not relevant_headlines:
        print(f"\nTidak ada data yang mengandung keyword '{keyword}'.")
        return

    # Sentiment analysis with the NLP pipeline
    sentiment_counts = {"Positif": 0, "Negatif": 0, "Netral": 0}
    examples = {"Positif": [], "Negatif": [], "Netral": []}
    source_counts = Counter(source for source, _, _ in relevant_headlines)
    category_counts = Counter(category for _, category, _ in relevant_headlines)

    print("Menganalisis sentimen dengan NLP (CUDA)...")
    # All texts go to the pipeline in one call; each is cut to 512 characters first.
    texts = [text[:512] for _, _, text in relevant_headlines]
    results = sentiment_analyzer(texts)

    for (source, _, text), result in zip(relevant_headlines, results):
        sentiment = _map_sentiment(result["label"], result["score"])
        sentiment_counts[sentiment] += 1
        # Keep at most three sample headlines per sentiment for the report.
        if len(examples[sentiment]) < 3:
            examples[sentiment].append(f"[{source}] {text}")

    # Print the text report
    total_relevant = len(relevant_headlines)
    print(f"\nData yang mengandung '{keyword}' (Berita): {total_relevant} item")
    print("\nAnalisis Sentimen (dengan NLP):")
    for sentiment, count in sentiment_counts.items():
        percentage = count / total_relevant * 100
        print(f"{sentiment}: {count} item ({percentage:.1f}%)")
        print("Contoh:")
        for example in examples[sentiment]:
            print(f" - {example}")
        print()

    # Save to CSV.  The keyword comes from user input, so characters that are
    # illegal in file names are replaced before it is used in the path.
    csv_path = f"data_{_filename_safe(keyword)}_news_nlp_cuda.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Sumber", "Kategori", "Teks"])
        writer.writerows(relevant_headlines)
    print(f"Data disimpan ke '{csv_path}'")

    # Assorted plots
    _apply_seaborn_style()

    # 1. Sentiment distribution (bar chart)
    plt.figure(figsize=(8, 5))
    sentiments = list(sentiment_counts.keys())
    counts = list(sentiment_counts.values())
    bars = plt.bar(sentiments, counts, color=['green', 'red', 'blue'])
    plt.title(f"Distribusi Sentimen untuk '{keyword}'")
    plt.xlabel("Sentimen")
    plt.ylabel("Jumlah Item")
    _annotate_bars(bars)
    plt.tight_layout()
    plt.show()

    # 2. Share of matches per source (pie chart)
    plt.figure(figsize=(10, 10))
    sources = list(source_counts.keys())
    source_values = list(source_counts.values())
    plt.pie(source_values, labels=sources, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired(np.arange(len(sources))))
    plt.title(f"Proporsi Data per Sumber untuk '{keyword}'")
    plt.tight_layout()
    plt.show()

    # 3. Number of matches per category (bar chart)
    plt.figure(figsize=(12, 6))
    categories = list(category_counts.keys())
    cat_counts = list(category_counts.values())
    bars = plt.bar(categories, cat_counts, color='skyblue')
    plt.title(f"Jumlah Data per Kategori untuk '{keyword}'")
    plt.xlabel("Kategori")
    plt.ylabel("Jumlah Item")
    plt.xticks(rotation=45, ha='right')
    _annotate_bars(bars)
    plt.tight_layout()
    plt.show()

    # 4. Word cloud built from all matching headlines
    all_text = " ".join(text for _, _, text in relevant_headlines).lower()
    wordcloud = WordCloud(width=800, height=400, background_color='white', min_font_size=10).generate(all_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud untuk '{keyword}'")
    plt.tight_layout()
    plt.show()

# Ask the user for the keyword to search for
keyword = input("Masukkan keyword yang ingin dicari (misalnya 'Pemerintah'): ").strip()

# Scrape every configured source/category.  A failure on one page is reported
# and skipped so the remaining pages are still collected.
all_headlines = {}
try:
    for source_name, categories in news_sources.items():
        all_headlines[source_name] = {}
        for category, url in categories.items():
            try:
                all_headlines[source_name][category] = scrape_headlines(url, source_name, category)
            except Exception as e:
                print(f"Gagal mengambil {source_name} - {category}: {e}")
finally:
    # Close the browser as soon as scraping ends -- also on an error or a
    # manual interrupt -- so no Chrome/chromedriver process is left behind.
    # The analysis below works purely on the collected strings.
    driver.quit()

# Count the total number of headlines collected
total_judul = sum(len(headlines) for source in all_headlines.values() for headlines in source.values())
print(f"\nTotal judul berita yang diambil: {total_judul}")

# Analyse and plot
analyze_and_plot_data(all_headlines, keyword)

CUDA tersedia. Menggunakan perangkat: NVIDIA GeForce GTX 1650 Ti


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Masukkan keyword yang ingin dicari (misalnya 'Pemerintah'):  Pemerintah


Memuat Detik - Terpopuler...
Ditemukan 20 judul dari Detik - Terpopuler
Memuat Detik - Otomotif...
Ditemukan 98 judul dari Detik - Otomotif
Memuat Detik - Politik...
Ditemukan 1 judul dari Detik - Politik
Memuat Detik - Ekonomi...
Ditemukan 107 judul dari Detik - Ekonomi
Memuat Detik - Olahraga...
Ditemukan 93 judul dari Detik - Olahraga
Memuat Detik - Teknologi...
Ditemukan 107 judul dari Detik - Teknologi
Memuat Detik - Hiburan...
Ditemukan 146 judul dari Detik - Hiburan
Memuat Kompas - Tren...
Ditemukan 3 judul dari Kompas - Tren
Memuat Kompas - Otomotif...
Ditemukan 3 judul dari Kompas - Otomotif
Memuat Kompas - Politik...
Ditemukan 10 judul dari Kompas - Politik
Memuat Kompas - Ekonomi...
