## Data Acquisition ##

In [1]:
# Install third-party dependencies into the running kernel's environment (%pip, not !pip).
%pip install feedparser pandas urllib3 sastrawi newspaper3k lxml_html_clean requests_html requests beautifulsoup4 openpyxl nltk



Scrape Link Berita

Sheets berisi link asli berita (bukan redirect dari Google News): https://docs.google.com/spreadsheets/d/1y-0uxDuZfzFd6bABieboryQ-nXHvE7Xq1WI0gMWurJM/edit?usp=sharing

In [2]:
import feedparser
import pandas as pd
from datetime import datetime
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from urllib.parse import urlparse

def scrape_google_news(keyword):
    """Search the Google News RSS feed (Indonesian edition) for ``keyword``.

    Args:
        keyword: Search phrase. It is percent-encoded before being placed in
            the feed URL, so it may contain spaces or quote characters.

    Returns:
        A list with one dict per feed entry, each holding the keys
        ``"title"``, ``"link"``, ``"published"`` and ``"source"``. A field
        that is absent from an entry is stored as ``None`` instead of raising.
    """
    # Percent-encode the keyword so it is safe inside the query string.
    encoded_keyword = quote(keyword)
    rss_url = f"https://news.google.com/rss/search?q={encoded_keyword}&hl=id&gl=ID&ceid=ID:id"

    feed = feedparser.parse(rss_url)

    data = []
    for entry in feed.entries:
        # `source` is a nested mapping; it may be missing or lack a title.
        source = entry.get("source") or {}
        data.append({
            "title": entry.get("title"),
            "link": entry.get("link"),
            "published": entry.get("published"),
            "source": source.get("title"),
        })

    return data

if __name__ == "__main__":
    # Use the phrase wrapped in double quotes as the search keyword.
    keyword = '"Mitra Darat"'
    news_data = scrape_google_news(keyword)

    df = pd.DataFrame(news_data)
    print(df)

    if df.empty:
        print("Tidak ada berita ditemukan.")
    else:
        # Stamp the export with today's date (YYYYMMDD).
        date_tag = datetime.now().strftime('%Y%m%d')
        filename = f"news_mitradarat_{date_tag}.csv"
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Data berhasil disimpan di {filename}")


                                                title  \
0   Cara Daftar Mudik Gratis Kemenhub 2025 melalui...   
1   Cara Mendaftar Mudik Gratis dari Kemenhub 2025...   
2   Ingin Ikut Mudik Gratis 2025? Ini Syarat dan C...   
3   Pengguna bisa cek pergerakan Trans Metro Dewat...   
4   Syarat dan Cara Daftar Mudik Gratis 2025 Kemen...   
..                                                ...   
95  Cara Mudik Gratis Pakai Bus, Motor Diangkut Pa...   
96  Penambahan Kuota Mudik Gratis dengan Bus Lebar...   
97  Cara Daftar Mudik Gratis Lewat Aplikasi MitraD...   
98  Cara Dapat Tiket Mudik Gratis 2023 Kemenhub di...   
99  Daftar Program Mudik Gratis, Persyaratan dan L...   

                                                 link  \
0   https://news.google.com/rss/articles/CBMitwFBV...   
1   https://news.google.com/rss/articles/CBMigwFBV...   
2   https://news.google.com/rss/articles/CBMidkFVX...   
3   https://news.google.com/rss/articles/CBMipwFBV...   
4   https://news.google.com/rs

Scraping isi berita

In [3]:
def scrape_article(url):
    """Download one news article and return its ``(title, text)``.

    newspaper3k is tried first. If it raises, the page is fetched with
    ``requests`` and parsed with BeautifulSoup instead: the ``<title>`` tag
    becomes the title and the text of every ``<p>`` tag is joined with spaces.

    Args:
        url: Direct article URL. Non-string or blank values (for example NaN
            from an empty spreadsheet cell) are treated as "nothing to scrape".

    Returns:
        ``(title, text)`` on success, or ``(None, None)`` when the URL is
        unusable or both strategies fail.
    """
    if not isinstance(url, str) or not url.strip():
        return None, None

    try:
        # Preferred path: newspaper3k.
        article = Article(url, language="id")
        article.download()
        article.parse()
        return article.title, article.text
    except Exception:
        # Best-effort: fall through to the BeautifulSoup path below.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        pass

    try:
        response = requests.get(url, timeout=10)
        # Do not scrape 4xx/5xx error pages as if they were article content.
        response.raise_for_status()
        # Pass raw bytes so BeautifulSoup can detect the page encoding itself.
        soup = BeautifulSoup(response.content, "html.parser")

        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else None

        text = " ".join(p.get_text() for p in soup.find_all("p"))
        return title, text.strip()
    except Exception:
        return None, None

if __name__ == "__main__":
    # Read the Excel workbook that lists the article links to scrape.
    df = pd.read_excel("Kelompok 3 - Link Berita MitraDarat.xlsx")

    # Fail early if the workbook does not have the expected columns.
    required_cols = ["PIC", "URL", "URL [Direct]", "Published"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"File harus memiliki kolom: {required_cols}")

    titles, contents, sources = [], [], []

    for link in df["URL [Direct]"]:
        title, content = scrape_article(link)
        titles.append(title)
        contents.append(content)

        # The source is the link's host name; values urlparse cannot handle
        # (e.g. a non-string cell) are recorded as None.
        try:
            domain = urlparse(link).netloc
        except Exception:
            domain = None
        sources.append(domain)

    # Assemble the scraped fields next to the original link and publish date.
    scraped_df = pd.DataFrame({
        "title": titles,
        "link": df["URL [Direct]"],
        "published": df["Published"],
        "source": sources,
        "content_raw": contents
    })

    # Save the raw scrape; text preprocessing happens in a later step.
    output_file = "news_mitradarat_scraped.csv"
    scraped_df.to_csv(output_file, index=False, encoding="utf-8-sig")

    print(f"✅ Data hasil scraping disimpan ke {output_file} (total: {len(scraped_df)})")

✅ Data hasil scraping disimpan ke news_mitradarat_scraped.csv (total: 174)


## Data Preprocessing

In [None]:
from bs4 import BeautifulSoup
from newspaper import Article
import nltk
import re
from urllib.parse import urlparse
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


# Fetch the NLTK resources used below (only needs to happen once per environment).
for nltk_resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(nltk_resource)

# ============== PREPROCESSING ==============
# Cache for the stop-word set and stemmer so they are built once, not once per article.
_PREPROCESS_RESOURCES = {}


def _load_preprocess_resources():
    """Return ``(stop_words, stemmer)``, building them on first use only."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing so a failure leaves the cache empty.
        stop_words = set(stopwords.words("indonesian") + stopwords.words("english"))
        stemmer = StemmerFactory().create_stemmer()
        _PREPROCESS_RESOURCES["stop_words"] = stop_words
        _PREPROCESS_RESOURCES["stemmer"] = stemmer
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalize raw article text into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every non-ASCII-letter/non-whitespace character
    with a space, tokenize with NLTK, drop Indonesian and English stop words
    and tokens of two characters or fewer, then stem each token with Sastrawi.

    Args:
        text: Raw article text. Anything that is not a non-empty string
            (``None``, NaN, numbers, ``""``) yields ``""``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not text or not isinstance(text, str):
        return ""

    stop_words, stemmer = _load_preprocess_resources()

    # Lowercase
    text = text.lower()

    # Replace non-alphabetic characters with spaces
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stop words (Indonesian + English) and very short tokens
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Stem each remaining token with the Sastrawi stemmer
    tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)

# ============== MAIN ==============
if __name__ == "__main__":
    # Load the raw scrape produced by the scraping step.
    df = pd.read_csv("news_mitradarat_scraped.csv")

    # Clean each article body into a new column.
    df["content_clean"] = df["content_raw"].map(preprocess_text)

    # Write the final result (utf-8-sig adds a BOM to the CSV).
    output_file = "news_mitradarat_preprocessed.csv"
    df.to_csv(output_file, index=False, encoding="utf-8-sig")

    print(f"✅ Data hasil preprocessing disimpan ke {output_file} (total: {len(df)})")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Display the cleaned-text column added by the preprocessing step.
df["content_clean"]