<a href="https://colab.research.google.com/github/XavierNgowKarYuen/Data-Engineering-Assignment-RDSY2S2/blob/xavier/Data_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import requests
from bs4 import BeautifulSoup

# Class to hold scraped data
class ScrapedData:
    def __init__(self, aid, title, date, publisher, views, comments_count, content):
        self.aid = aid
        self.title = title
        self.date = date
        self.publisher = publisher
        self.views = views
        self.comments_count = comments_count
        self.content = content

    # String representation of the object
    def __str__(self):
        return f"""
        Aid: {self.aid}
        Title: {self.title}
        Date: {self.date}
        Publisher: {self.publisher}
        Views: {self.views}
        Comments Count: {self.comments_count}
        Content:
        {self.content[:200]}...  # Truncated for readability
        """

# Function to scrape a single article
def scrape_article(url, aid):
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch URL: {url} - Status Code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract title
    title = soup.find('title').text.strip() if soup.find('title') else "Unknown Title"

    # Extract date
    date_tag = soup.find('p', class_='xg1')
    date = date_tag.text.split('|')[0].strip() if date_tag else "Unknown Date"

    # Extract publisher
    publisher_tag = date_tag.find('a') if date_tag else None
    publisher = publisher_tag.text.strip() if publisher_tag else "Unknown Publisher"

    # Extract views
    views_tag = soup.find('em', id='_viewnum')
    views = views_tag.text.strip() if views_tag else "0"

    # Extract comments count
    comments_tag = soup.find('em', id='_commentnum')
    comments_count = comments_tag.text.strip() if comments_tag else "0"

    # Extract content
    content_tag = soup.find('td', id='article_content')
    content = content_tag.get_text(strip=True) if content_tag else "No Content Available"

    return ScrapedData(aid, title, date, publisher, views, comments_count, content)

# Main function to scraping process
def main():
    base_url = "https://b.cari.com.my/portal.php?mod=view&aid="

    # List of AID values to scrape
    aid_values = list(range(1, 6))
    for aid in aid_values:
        url = f"{base_url}{aid}"

        try:
            scraped_data = scrape_article(url, aid)
            print(scraped_data)  # Print data
            print("\n------------------------\n")
        except Exception as e:
            print(f"Error scraping {url}: {e}")
if __name__ == "__main__":
    main()



        Aid: 1
        Title: Laporan polis terhadap Uncle Seekers didakwa hina Sultan Johor - CariDotMy
        Date: 19-7-2012 05:16 PM
        Publisher: cmcadmin
        Views: 243700
        Comments Count: 1917
        Content: 
        Post Last Edit by the_killer at 1-7-2012 17:14JOHOR BAHARU 1 Julai - Tiga puluh individu hari ini membuat laporan polis terhadap pengamal paranormal Syed Abdullah Hussein Al-Attas atau lebih dikenali ...  # Truncated for readability
        

------------------------


        Aid: 2
        Title: K'jaan pertahan cuti peribadi Najib di Milan - CariDotMy
        Date: 20-7-2012 10:10 AM
        Publisher: cmcadmin
        Views: 11114
        Comments Count: 158
        Content: 
        Selepas didesak oleh anggota parlimen pembangkang, Jabatan Perdana Menteri (JPM) mengakui Perdana Menteri Datuk Seri Najib Razak bercuti peribadi di Milan menggunakan pesawat rasmi.Bagaimanapun, menur...  # Truncated for readability
        

--------------------