# 🧹 Web Scraping News: MU's defeat vs ASEAN All Stars

***

## 🛠️ Import Libraries

In [1]:
#!pip install selenium

In [2]:
#!pip install requests beautifulsoup4

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 📌 Scraping Function from Detik

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_detik(url, timeout=10):
    """Fetch a Detik article page and extract its title, date and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        dict with the keys ``'title'``, ``'date'`` and ``'content'``. A field
        whose element is missing from the page holds ``'Tidak ditemukan'``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    not_found = 'Tidak ditemukan'

    # Look every element up once instead of searching the tree twice.
    title_tag = soup.find('h1')
    date_tag = soup.find('div', class_='detail__date')
    content_tag = soup.find('div', class_='detail__body-text')

    title = title_tag.text.strip() if title_tag is not None else not_found
    date = date_tag.text.strip() if date_tag is not None else not_found
    # One line per text node so paragraph boundaries survive extraction.
    content = (
        content_tag.get_text(separator='\n').strip()
        if content_tag is not None
        else not_found
    )

    return {'title': title, 'date': date, 'content': content}

## 📌 Scraping Function from Kompas

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

def scrape_kompas(url, headless=True, wait_seconds=3):
    """Load a Kompas article in Chrome via Selenium and extract its fields.

    Args:
        url: Address of the article page.
        headless: When true, Chrome is started with ``--headless`` and
            ``--disable-gpu`` so no browser window is opened.
        wait_seconds: Fixed delay after navigation, giving the page time to
            finish rendering before its HTML is captured.

    Returns:
        dict with the keys ``'title'``, ``'date'`` and ``'content'``. A field
        whose element is missing from the page holds ``'Tidak ditemukan'``.
    """
    options = Options()
    if headless:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    not_found = 'Tidak ditemukan'

    try:
        driver.get(url)
        time.sleep(wait_seconds)  # give the page time to render

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Look every element up once instead of searching the tree twice.
        title_tag = soup.find('h1')
        date_tag = soup.find('div', class_='read__time')
        content_tag = soup.find('div', class_='read__content')

        title = title_tag.text.strip() if title_tag is not None else not_found
        date = date_tag.text.strip() if date_tag is not None else not_found
        content = (
            content_tag.get_text(separator='\n').strip()
            if content_tag is not None
            else not_found
        )
    finally:
        # Always shut the browser down, even when loading or parsing fails.
        driver.quit()

    return {'title': title, 'date': date, 'content': content}

## 📌 Scraping Function from Okezone

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

def scrape_okezone(url, headless=True, wait_seconds=3):
    """Load an Okezone article in Chrome via Selenium and extract its fields.

    Args:
        url: Address of the article page.
        headless: When true, Chrome is started with ``--headless`` and
            ``--disable-gpu`` so no browser window is opened.
        wait_seconds: Fixed delay after navigation, giving the page time to
            finish rendering before its HTML is captured.

    Returns:
        dict with the keys ``'title'``, ``'date'`` and ``'content'``. A field
        whose element is missing from the page holds ``'Tidak ditemukan'``.
    """
    options = Options()
    if headless:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    not_found = 'Tidak ditemukan'

    try:
        driver.get(url)
        time.sleep(wait_seconds)  # give the page time to render

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Look every element up once instead of searching the tree twice.
        title_tag = soup.find('h1')
        date_tag = soup.find('li', class_='date')
        content_tag = soup.find('div', class_='news-content')

        title = title_tag.text.strip() if title_tag is not None else not_found
        date = date_tag.text.strip() if date_tag is not None else not_found
        content = (
            content_tag.get_text(separator='\n').strip()
            if content_tag is not None
            else not_found
        )
    finally:
        # Always shut the browser down, even when loading or parsing fails.
        driver.quit()

    return {'title': title, 'date': date, 'content': content}

## 📌 Scraping Function from CNN Indonesia

In [7]:
import requests
from bs4 import BeautifulSoup

def scrape_cnnindonesia(url, timeout=10):
    """Fetch a CNN Indonesia article page and extract title, date and body.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        dict with the keys ``'title'``, ``'date'`` and ``'content'``. A field
        whose element is missing from the page holds ``'Tidak ditemukan'``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    not_found = 'Tidak ditemukan'

    # Look every element up once instead of searching the tree twice.
    title_tag = soup.find('h1')
    date_tag = soup.find('div', class_='text-cnn_grey')
    content_tag = soup.find('div', class_='detail-text')

    title = title_tag.text.strip() if title_tag is not None else not_found
    date = date_tag.text.strip() if date_tag is not None else not_found
    # One line per text node so paragraph boundaries survive extraction.
    content = (
        content_tag.get_text(separator='\n').strip()
        if content_tag is not None
        else not_found
    )

    return {'title': title, 'date': date, 'content': content}

## 📌 Scraping Function from Suara Kalbar

In [8]:
import requests
from bs4 import BeautifulSoup

def scrape_suarakalbar(url, timeout=10):
    """Fetch a Suara Kalbar article page and extract title, date and body.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        dict with the keys ``'title'``, ``'date'`` and ``'content'``. A field
        whose element is missing from the page holds ``'Tidak ditemukan'``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    not_found = 'Tidak ditemukan'

    # Look every element up once instead of searching the tree twice.
    title_tag = soup.find('h1')
    date_tag = soup.find('time')
    # A CSS selector matches both classes regardless of their order or of any
    # additional classes on the element; find(class_='a b') only matches when
    # the class attribute is exactly that string.
    content_tag = soup.select_one('div.td-post-content.tagdiv-type')

    title = title_tag.text.strip() if title_tag is not None else not_found
    date = date_tag.text.strip() if date_tag is not None else not_found
    # One line per text node so paragraph boundaries survive extraction.
    content = (
        content_tag.get_text(separator='\n').strip()
        if content_tag is not None
        else not_found
    )

    return {'title': title, 'date': date, 'content': content}

## 🧪 Test Scraping Function

In [9]:
# Article pages to scrape, one per news outlet (Detik, Kompas, Okezone,
# CNN Indonesia, Suara Kalbar).
urls = [
    "https://sport.detik.com/sepakbola/liga-inggris/d-7938117/asean-all-stars-vs-mu-setan-merah-kalah-0-1",
    "https://www.kompas.com/sports/read/2025/05/28/22233688/skor-asean-all-stars-vs-man-united-1-0-mu-menanggung-malu-di-bukit-jalil",
    "https://bola.okezone.com/read/2025/05/28/51/3142856/hasil-asean-all-stars-vs-manchester-united-setan-merah-kalah-0-1",
    "https://www.cnnindonesia.com/olahraga/20250528213751-142-1234366/asean-all-stars-hajar-manchester-united",
    "https://www.suarakalbar.co.id/2025/05/asean-all-star-kejutkan-manchester-united-1-0-dua-pemain-indonesia-turun-berlaga/"
]

# Accumulates one result dict per successfully scraped article.
articles = []

In [10]:
from urllib.parse import urlparse

# Hostname keyword -> scraper. Matching is done against the hostname only, so
# an outlet name appearing in another site's URL path cannot pick the wrong
# scraper. Insertion order is the lookup order.
SCRAPERS = {
    'kompas': scrape_kompas,
    'okezone': scrape_okezone,
    'cnnindonesia': scrape_cnnindonesia,
    'suarakalbar': scrape_suarakalbar,
    'detik': scrape_detik,
}

# Start from an empty list so that re-running this cell does not append the
# same articles a second time.
articles = []

for url in urls:
    host = urlparse(url).netloc
    scraper = next((fn for key, fn in SCRAPERS.items() if key in host), None)

    if scraper is None:
        print(f"[!] URL tidak dikenali: {url}")
        continue

    try:
        result = scraper(url)
    except Exception as e:
        # Best effort: report the failure and carry on with the next URL.
        print(f"[!] Gagal scrape {url}:\n", e)
        continue

    result['source'] = host
    result['url'] = url
    articles.append(result)

In [11]:
# One row per scraped article; a field reads 'Tidak ditemukan' when its
# element was not found on the page.
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,title,date,content,source,url
0,ASEAN All Stars Vs MU: Setan Merah Kalah 0-1,"Rabu, 28 Mei 2025 21:43 WIB",Kuala Lumpur\n - \nManchester United\n memulai...,sport.detik.com,https://sport.detik.com/sepakbola/liga-inggris...
1,403 ERROR,Tidak ditemukan,Tidak ditemukan,www.kompas.com,https://www.kompas.com/sports/read/2025/05/28/...
2,Hasil ASEAN All Stars vs Manchester United: Se...,Tidak ditemukan,Tidak ditemukan,bola.okezone.com,https://bola.okezone.com/read/2025/05/28/51/31...
3,ASEAN All Stars Hajar Manchester United,"Rabu, 28 Mei 2025 21:43 WIB","Jakarta, CNN Indonesia\n -- \nASEAN All Stars ...",www.cnnindonesia.com,https://www.cnnindonesia.com/olahraga/20250528...
4,"ASEAN All-Star Kejutkan Manchester United 1-0,...",Tidak ditemukan,Tidak ditemukan,www.suarakalbar.co.id,https://www.suarakalbar.co.id/2025/05/asean-al...


## 💬 Scrape Comments Function from Kompas

In [33]:
import requests
from bs4 import BeautifulSoup

def extract_comments_kompas(url, timeout=10):
    """Collect reader comments and their replies from a Kompas article page.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        list of dicts with the keys ``'user'``, ``'komentar'``, ``'tipe'``
        (``'utama'`` for a top-level comment, ``'balasan'`` for a reply) and
        ``'sumber'`` (always ``'kompas'``). Each top-level comment is followed
        by its replies.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # NOTE(review): only the HTML returned by the server is parsed; if the
    # comment section is filled in by JavaScript the result will be empty.
    # Confirm against the live page.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    comments_data = []

    def _collect(node, tipe):
        """Append one record for ``node`` when it has both a user and a text."""
        user_tag = node.select_one('div.commentUser')
        text_tag = node.select_one('div.commentContent-inner')
        if user_tag and text_tag:
            comments_data.append({
                'user': user_tag.get_text(strip=True),
                'komentar': text_tag.get_text(strip=True),
                'tipe': tipe,
                'sumber': 'kompas'
            })

    # Top-level comments
    for item in soup.select('div.commentItem[id^="commentItem-"]'):
        _collect(item, 'utama')

        # Replies (if any)
        reply_block = item.select_one('div[id^="commentReplyList-"]')
        if reply_block:
            for reply in reply_block.select('div.commentItem[id^="commentReplyItem-"]'):
                _collect(reply, 'balasan')

    return comments_data

## 💬 Scrape Comments Function from Detik

In [34]:
def extract_comments_detik(url, timeout=10):
    """Collect reader comments from a Detik page.

    Args:
        url: Address of the page to parse.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        list of dicts with the keys ``'user'``, ``'komentar'``, ``'tipe'``
        (always ``'utama'``) and ``'sumber'`` (always ``'detik'``).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # NOTE(review): the selectors target "komentar-iframe" markup; if those
    # elements live in a separate iframe document rather than in the page at
    # ``url``, nothing will be found. Confirm against the live page.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    comments_data = []

    for item in soup.select('div.komentar-iframe-min-list-content__item'):
        user_tag = item.select_one('div.komentar-iframe-min-media__user')
        text_tag = item.select_one('div.komentar-iframe-min-media__desc')

        # Skip entries that lack either the author or the comment text.
        if user_tag and text_tag:
            comments_data.append({
                'user': user_tag.get_text(strip=True),
                'komentar': text_tag.get_text(strip=True),
                'tipe': 'utama',  # the iframe does not expose replies explicitly
                'sumber': 'detik'
            })

    return comments_data

## 📅 Merging into a DataFrame

In [35]:
import pandas as pd

# Article pages whose comment sections are harvested, keyed by outlet.
comm_urls = dict(
    kompas='https://www.kompas.com/sports/read/2025/05/28/22233688/skor-asean-all-stars-vs-man-united-1-0-mu-menanggung-malu-di-bukit-jalil',
    detik='https://sport.detik.com/sepakbola/liga-inggris/d-7938117/asean-all-stars-vs-mu-setan-merah-kalah-0-1',
)

# Run each outlet's extractor on its own article.
kompas_comments = extract_comments_kompas(comm_urls['kompas'])
detik_comments = extract_comments_detik(comm_urls['detik'])

# Stack both comment lists (Kompas first, then Detik) into a single table.
comment_df = pd.DataFrame([*kompas_comments, *detik_comments])
comment_df.head()