In [1]:
import os
import csv
import pandas as pd
from enum import Enum
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.remote.webelement import WebElement

In [2]:
class FileHandler():
    """Helpers for storing crawled articles in CSV files."""

    @staticmethod
    def is_file_empty(file_name: str) -> bool:
        """Return True if the file is empty or does not exist.

        Args:
            - file_name: path of the file to check
        """
        try:
            return os.stat(file_name).st_size == 0
        except FileNotFoundError:
            return True

    @staticmethod
    def write_to_csv(rows: list, file_name: str) -> None:
        """Save rows to a CSV file, combining them with rows already stored.

        When the file is missing or empty, the de-duplicated rows are written
        as-is. Otherwise the new rows are combined with the stored ones, sorted
        by the 'Date' column in descending (string) order, de-duplicated and
        written back. Missing parent folders are created first.

        Args:
            - rows: list of 5-item records ordered as
              Title, Content, Date, Url, Summary
            - file_name: path of the CSV file to create or update
        """
        header = ['Title', 'Content', 'Date', 'Url', 'Summary']
        data = pd.DataFrame(rows, index=None, columns=header)

        # to_csv does not create folders, so make sure the target one exists.
        parent = os.path.dirname(file_name)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if not FileHandler.is_file_empty(file_name):
            # Read everything back as plain text: with the default NA parsing,
            # empty fields and literal values such as "NA" turn into NaN and
            # would no longer compare equal to freshly crawled strings.
            existed_data = pd.read_csv(file_name, dtype=str, keep_default_na=False)
            # remove index columns ("Unnamed: 0") left by files saved with an index
            existed_data = existed_data.drop(columns=existed_data.filter(regex="Unname").columns)
            # Stack the frames instead of outer-merging them: merge refuses to
            # join key columns whose inferred dtypes differ.
            data = pd.concat([data, existed_data], ignore_index=True)
            # A stable sort keeps new rows ahead of stored rows with the same date.
            data = data.sort_values(by='Date', axis=0, ascending=False, kind='stable')

        # drop duplicate rows (the first occurrence is kept)
        data = data.drop_duplicates()
        data.to_csv(file_name, index=False)

In [3]:
class BrowserOption(Enum):
    """Web browsers that can be selected when creating a WebDriver.

    The numeric values are arbitrary identifiers; code should compare
    against the members themselves.
    """

    EDGE = 1      # Microsoft Edge
    CHROME = 2    # Google Chrome
    FIREFOX = 3   # Mozilla Firefox
    SAFARI = 4    # Apple Safari

class TuoiTre_Crawler:
    """Selenium-based crawler that saves tuoitre.vn articles per category."""

    @staticmethod
    def _without_images(options):
        """Return the Chromium-style options after asking the browser to skip images."""
        options.add_argument("--blink-settings=imagesEnabled=false")
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        return options

    @staticmethod
    def get_driver(browser_option: BrowserOption = BrowserOption.EDGE):
        """Return a WebDriver for the requested browser.

        Edge and Chrome are started with image loading switched off; Firefox
        and Safari use their default configuration. Any value other than
        EDGE, FIREFOX or SAFARI (including CHROME) yields a Chrome driver.

        Args:
            - browser_option: the option of browser's driver
        """
        if browser_option == BrowserOption.EDGE:
            options = TuoiTre_Crawler._without_images(webdriver.EdgeOptions())
            return webdriver.ChromiumEdge(options=options)
        if browser_option == BrowserOption.FIREFOX:
            return webdriver.Firefox()
        if browser_option == BrowserOption.SAFARI:
            return webdriver.Safari()
        options = TuoiTre_Crawler._without_images(webdriver.ChromeOptions())
        return webdriver.Chrome(options=options)

    def __init__(self, browser_option: BrowserOption, folder_to_save: str, category_url: dict) -> None:
        """Create a new instance of crawler

        Creating the instance starts the browser session immediately.

        Args:
            - browser_option: the option of browser's driver
            - folder_to_save: folder to save data crawled
            - category_url: a dictionary which key=filename, value=category url
        """
        self.driver = TuoiTre_Crawler.get_driver(browser_option)
        self.folder_to_save = folder_to_save
        self.category_url = category_url

    def crawl(self, url: str) -> tuple:
        """Crawl data from single url

        Args:
            - url: a news url

        Returns:
            A tuple (title, content, date, summary) of strings, where content
            is the text of all matched body elements joined by single spaces.

        Raises:
            Whatever the driver raises when the page cannot be loaded or the
            title, date or summary element is not found.
        """
        self.driver.get(url)

        title_selector = "#main-detail > .article-title"
        contents_selector = "div.detail-content.afcbc-body > :not(.VCSortableInPreviewMode, #InreadPc)"
        date_selector = "#main-detail > div.detail-top > div.detail-time"
        summary_selector = "#main-detail > .detail-sapo"

        title = self.driver.find_element(By.CSS_SELECTOR, title_selector)
        contents = self.driver.find_elements(By.CSS_SELECTOR, contents_selector)
        date = self.driver.find_element(By.CSS_SELECTOR, date_selector)
        summary = self.driver.find_element(By.CSS_SELECTOR, summary_selector)

        # Only the first line of the title element's text is kept.
        title_text = title.text.split('\n')[0]
        joined_content = " ".join(x.text for x in contents)
        return (title_text, joined_content, date.text, summary.text)

    def news_from_category(self, filename: str, url: str):
        """Crawl every article linked from a category page and save the result

        Articles that fail to load or parse are reported and skipped, so one
        broken page does not abort the whole category.

        Args:
            - filename: file to save data
            - url: a category page url

        Returns:
            The number of articles crawled successfully.
        """
        self.driver.get(url)

        focus_main_selector = "div.list__focus-main a.box-category-link-title"
        focus_main = self.driver.find_elements(By.CSS_SELECTOR, focus_main_selector)

        listing_main_selector = "div.list__listing-main a.box-category-link-title"
        listing_main = self.driver.find_elements(By.CSS_SELECTOR, listing_main_selector)

        # Read all hrefs before navigating away: the elements belong to the
        # category page and cannot be used once another page is loaded.
        hrefs = [link.get_property('href') for link in focus_main]
        hrefs.extend(link.get_property('href') for link in listing_main)
        # Drop missing hrefs and repeated links while keeping the page order.
        news_urls = list(dict.fromkeys(href for href in hrefs if href))

        rows = []
        for news_url in news_urls:
            try:
                (title, joined_content, date, summary) = self.crawl(news_url)
            except Exception as err:
                # Best effort: report and move on. Catching Exception (not a
                # bare except) lets Ctrl+C / SystemExit stop the crawl.
                print("Skipped", news_url, "-", type(err).__name__)
                continue
            rows.append([title, joined_content, date, news_url, summary])

        FileHandler.write_to_csv(rows, os.path.join(self.folder_to_save, filename))
        print("Crawled", len(rows), "from", url)
        return len(rows)

    def start_crawl(self):
        """Crawl every configured category, then close the browser.

        The driver is quit even if a category raises, so no browser process
        is left behind; the instance cannot crawl again afterwards.
        """
        count = 0
        try:
            for file, url in self.category_url.items():
                count += self.news_from_category(file, url)
            print("Done! Crawled", count)
        finally:
            self.driver.quit()

    def data_summary(self, verbose=False):
        """Print DataFrame.info() for each category CSV and for all of them combined.

        Args:
            - verbose: forwarded to DataFrame.info
        """
        header = ['Title', 'Content', 'Date', 'Url', 'Summary']
        frames = []
        for file in self.category_url:
            data = pd.read_csv(os.path.join(self.folder_to_save, file))
            frames.append(data)
            print('Data:', file)
            # info() prints its report itself and returns None, so it must not
            # be wrapped in print().
            data.info(verbose=verbose)
            print('=====================================')

        if frames:
            # Rows present in more than one file are counted once.
            total = pd.concat(frames, ignore_index=True).drop_duplicates(ignore_index=True)
        else:
            total = pd.DataFrame(columns=header)
        print('Total:')
        total.info(verbose=verbose)

In [4]:
# Maps each output CSV file name to the category page it is filled from.
category_url = {
    "tuoitre_kinhdoanh.csv": "https://tuoitre.vn/kinh-doanh.htm",
    "tuoitre_congnghe.csv": "https://tuoitre.vn/cong-nghe.htm",
    "tuoitre_dulich.csv": "https://tuoitre.vn/du-lich.htm",
    "tuoitre_vanhoa.csv": "https://tuoitre.vn/van-hoa.htm",
    "tuoitre_giaitri.csv": "https://tuoitre.vn/giai-tri.htm",
    "tuoitre_thethao.csv": "https://tuoitre.vn/the-thao.htm",
    "tuoitre_giaoduc.csv": "https://tuoitre.vn/giao-duc.htm",
}

# Start an Edge session, crawl every category above and write the CSV files
# into the dataset folder.
crawler = TuoiTre_Crawler(
    browser_option=BrowserOption.EDGE,
    folder_to_save='./dataset/05-27',
    category_url=category_url,
)
crawler.start_crawl()

Crawled 27 from https://tuoitre.vn/kinh-doanh.htm
Crawled 29 from https://tuoitre.vn/cong-nghe.htm
Crawled 17 from https://tuoitre.vn/du-lich.htm
Crawled 22 from https://tuoitre.vn/van-hoa.htm
Crawled 29 from https://tuoitre.vn/giai-tri.htm
Crawled 28 from https://tuoitre.vn/the-thao.htm
Crawled 28 from https://tuoitre.vn/giao-duc.htm
Done! Crawled 180


In [5]:
# Print the DataFrame.info() report of every category CSV, then of all of them combined.
crawler.data_summary(verbose=True)

Data: tuoitre_kinhdoanh.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Title    82 non-null     object
 1   Content  82 non-null     object
 2   Date     82 non-null     object
 3   Url      82 non-null     object
 4   Summary  82 non-null     object
dtypes: object(5)
memory usage: 3.3+ KB
None
Data: tuoitre_congnghe.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Title    46 non-null     object
 1   Content  46 non-null     object
 2   Date     46 non-null     object
 3   Url      46 non-null     object
 4   Summary  46 non-null     object
dtypes: object(5)
memory usage: 1.9+ KB
None
Data: tuoitre_dulich.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 5 columns):
 #   C

### Test cells (commented-out prototypes of the crawler and CSV helpers above)

In [6]:
# def crawl(url: str) -> tuple[list[WebElement], list[WebElement]]:
#     """Crawl data from single url
    
#     Args:
#         - url: a news url
#     """
#     options = webdriver.EdgeOptions()
#     options.add_argument("--blink-settings=imagesEnabled=false")
#     options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
#     driver = webdriver.ChromiumEdge(options=options)

#     driver.get(url)

#     title_selector = "#main-detail > .article-title"
#     title = driver.find_element(By.CSS_SELECTOR, title_selector)
#     print(title.text.split('\n')[0])

#     date_selector = "#main-detail > div.detail-top > div.detail-time"
#     date = driver.find_element(By.CSS_SELECTOR, date_selector)
#     print(date.text)

#     summary_selector = "#main-detail > .detail-sapo"
#     summary = driver.find_element(By.CSS_SELECTOR, summary_selector)
#     print(summary.text)

#     content_selector = "div.detail-content.afcbc-body > :not(.VCSortableInPreviewMode, #InreadPc)"
#     content = driver.find_elements(By.CSS_SELECTOR, content_selector)
#     joined_content = " ".join(x.text+'\n' for x in content)
#     print("[", joined_content, "]")

# crawl('https://tuoitre.vn/thai-lan-dan-dau-sang-kien-visa-chung-o-dong-nam-a-20240413224000997.htm')

In [7]:
# def news_from_category(url: str):
#     """Get news URL from category page
    
#     Args:
#         - url: a category page url
#     """
#     options = webdriver.EdgeOptions()
#     options.add_argument("--blink-settings=imagesEnabled=false")
#     options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
#     driver = webdriver.ChromiumEdge(options=options)

#     driver.get(url)

#     focus_main_selector = "div.list__focus-main a.box-category-link-title"
#     focus_main = driver.find_elements(By.CSS_SELECTOR, focus_main_selector)
#     print("focus_main", len(focus_main))
#     # for e in focus_main:
#     #     print(e.get_property('href'))

#     listing_main_selector = "div.list__listing-main a.box-category-link-title"
#     listing_main = driver.find_elements(By.CSS_SELECTOR, listing_main_selector)
#     print("listing_main", len(listing_main))
#     # for e in listing_main:
#     #     print(e.get_property('href'))
    
#     news_url = [url.get_property('href') for url in focus_main]
#     news_url.extend([url.get_property('href') for url in listing_main])
#     print(len(news_url))
#     for e in news_url:
#         print(e)

# news_from_category('https://tuoitre.vn/the-gioi.htm')

In [8]:
# import pandas as pd

# def write_to_csv(rows: list, file_name):
#     header = ['Title', 'Content', 'Date', 'Url', 'Summary']
#     data = pd.DataFrame(rows, index=None, columns=header)

#     if FileHandler.is_file_empty(file_name):
#         data.to_csv(file_name)
#     else:
#         existed_data = pd.read_csv(file_name)
#         merged_data = pd.merge(data, existed_data, how='outer').sort_values(by='Date', axis=0, ascending=False)
#         merged_data.to_csv(file_name, index=False)