In [4]:
# selenium import
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

# other imports
import os
import json

In [5]:
# Chrome launch options reused by every webdriver.Chrome(...) call in this notebook.
chrome_options = webdriver.ChromeOptions()

# Add "--headless" to this tuple when there is no display (required on Google Colab).
for chrome_flag in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-extensions",
    "--disable-gpu",
):
    chrome_options.add_argument(chrome_flag)


In [6]:
# Name of the news site being crawled (not referenced by the other cells shown here).
MAGAZINE_NAME = "vnexpress"
# Site root: opened first, and later used as the prefix an article URL must start with.
HOME_PAGE = "https://vnexpress.net/"

Go to home page

In [7]:
# Start a Chrome session with the shared options and load the site's home page.
driver = webdriver.Chrome(options=chrome_options)
driver.get(HOME_PAGE)

Select the menu button

In [8]:
## Locate the element whose class name is "all-menu"
all_menu = driver.find_element(by=By.CLASS_NAME, value="all-menu")

## Click it (presumably this expands the full category menu that is read next; verify on the live page)
all_menu.click() 

Collect all categories on the home page

In [9]:
# Collect every category (display name + URL) listed in the "row-menu" container.
cats = []
row_menu = driver.find_element(by=By.CLASS_NAME, value="row-menu")
cat_menus = row_menu.find_elements(by=By.CLASS_NAME, value="cat-menu")

for cat_menu in cat_menus:
    # Look the anchor up once instead of once per attribute.
    anchor = cat_menu.find_element(by=By.TAG_NAME, value="a")

    # get_attribute() returns None for a missing attribute; fall back to ""
    # so that .strip() cannot raise AttributeError.
    cat = (anchor.get_attribute("title") or "").strip()
    href = (anchor.get_attribute("href") or "").strip()

    if not href:
        # A menu entry without a link cannot be crawled later, so skip it.
        continue

    cats.append({"cat_name": cat, "url": href})

In [10]:
cats, len(cats)

([{'cat_name': 'Thời sự', 'url': 'https://vnexpress.net/thoi-su'},
  {'cat_name': 'Thế giới', 'url': 'https://vnexpress.net/the-gioi'},
  {'cat_name': 'Kinh doanh', 'url': 'https://vnexpress.net/kinh-doanh'},
  {'cat_name': 'Công nghệ', 'url': 'https://vnexpress.net/cong-nghe'},
  {'cat_name': 'Khoa học', 'url': 'https://vnexpress.net/khoa-hoc'},
  {'cat_name': 'Video', 'url': 'https://video.vnexpress.net/'},
  {'cat_name': 'Podcasts', 'url': 'https://vnexpress.net/podcast'},
  {'cat_name': 'Góc nhìn', 'url': 'https://vnexpress.net/goc-nhin'},
  {'cat_name': 'Bất động sản', 'url': 'https://vnexpress.net/bat-dong-san'},
  {'cat_name': 'Sức khỏe', 'url': 'https://vnexpress.net/suc-khoe'},
  {'cat_name': 'Thể thao', 'url': 'https://vnexpress.net/the-thao'},
  {'cat_name': 'Giải trí', 'url': 'https://vnexpress.net/giai-tri'},
  {'cat_name': 'Pháp luật', 'url': 'https://vnexpress.net/phap-luat'},
  {'cat_name': 'Giáo dục', 'url': 'https://vnexpress.net/giao-duc'},
  {'cat_name': 'Đời sống',

In [11]:
# Shut the browser down after use. quit() ends the whole WebDriver session
# (every window plus the chromedriver process), whereas close() only closes
# the current window and can leave the driver process running.
driver.quit()

### Collect article links for each news category of the website: news URLs


> **How to collect**

Using the categories and their links collected above, we visit each category in turn and collect the links of the articles listed in that section of the website.



#### Parameter settings

In [12]:
# Target number of article links to collect for each category
NUM_ARTICLES_PER_CAT = 30 

# Path of the JSON file in which the collected VnExpress URLs are stored
DATA_URL_FILE = "data/vnexpress_url.json"

# Categories that are skipped when crawling
EXCLUDING_CATEGORIES = ["Video", "Podcasts", "Góc nhìn", "Tâm sự", "Thư giãn", "Ý kiến"]

# Additional chromedriver settings
## Use the "normal" page load strategy: https://www.selenium.dev/documentation/webdriver/drivers/options/
chrome_options.page_load_strategy = "normal"

In [13]:
# Start a new browser session for the URL crawl (the earlier driver was closed above).
driver = webdriver.Chrome(options=chrome_options)

In [14]:
# Global set of every article URL collected so far; used to de-duplicate across categories
crawled_urls = set()

def crawl_each_category_url(driver, category_url):
    """
    Collect article URLs for one category by walking its listing pages.

    Starting at ``category_url``, every element with class ``title-news`` on
    the current page is inspected and the ``href`` of its first ``<a>`` is
    kept when it starts with ``HOME_PAGE`` and is not yet in the global
    ``crawled_urls`` set. The ``next-page`` link is then followed until at
    least ``NUM_ARTICLES_PER_CAT`` URLs have been gathered or there is no
    further page to visit.

    The limit is only checked between pages, so the returned set may hold
    more than ``NUM_ARTICLES_PER_CAT`` URLs.

    :param driver: Selenium WebDriver instance used to load the pages.
    :param category_url: URL of the category's first listing page.
    :return: Set of newly collected article URLs (each one is also added to
        the global ``crawled_urls``).
    """
    all_urls = set()
    url = category_url
    # Listing pages already loaded; guards against a "next" link that points
    # back to a page we have seen, which would otherwise loop forever.
    visited_pages = set()

    while url and url not in visited_pages and len(all_urls) < NUM_ARTICLES_PER_CAT:
        visited_pages.add(url)
        driver.get(url)
        title_news = driver.find_elements(by=By.CLASS_NAME, value="title-news")
        for title in title_news:
            try:
                url_new = title.find_element(by=By.TAG_NAME, value="a").get_attribute("href")
                # href may be None; the prefix check filters out ads and links to other sites
                if url_new and url_new.startswith(HOME_PAGE) and url_new not in crawled_urls:
                    all_urls.add(url_new)
                    crawled_urls.add(url_new)  # remember it so other categories skip it

            except StaleElementReferenceException:
                # The element went stale while the page was changing; skip it
                continue
            except NoSuchElementException:
                print(f"NoSuchElementException at {url}")
                continue

        try:
            # A missing href yields None, which ends the loop through the `url` check above
            url = driver.find_element(by=By.CLASS_NAME, value="next-page").get_attribute("href")
        except NoSuchElementException:
            # Last listing page: return what was gathered instead of aborting the crawl
            break

    return all_urls

In [None]:
# Maps category name -> list of article URLs collected for it
saved_cats = {}

# Create the output directory before crawling, so that a long crawl is not
# lost to a FileNotFoundError when the results are finally written.
os.makedirs(os.path.dirname(DATA_URL_FILE) or ".", exist_ok=True)

try:
    # Collect for each category
    for cat in cats:
        cat_name = cat["cat_name"]
        url = cat["url"]
        if cat_name not in EXCLUDING_CATEGORIES:
            print(f"You are at {cat}.")
            urls = crawl_each_category_url(driver, url)
            saved_cats[cat_name] = list(urls)
finally:
    # Always end the browser session (windows + chromedriver process),
    # even when a category fails part-way through.
    driver.quit()

# Reached only when every category was crawled without an unhandled error
with open(DATA_URL_FILE, "w", encoding="utf-8") as fOut:
    json.dump(saved_cats, fOut, ensure_ascii=False, indent=4)

In [None]:
len(crawled_urls)

721

### Collect and process each article based on the link of the previous step: News articles

> **How to collect**

Using the links collected in the previous section, we visit each link in turn and collect information about the article.

#### Parameter settings

In [None]:
# Path of the URL file written by the previous step
FILE_URL_PATH = "data/vnexpress_url.json"

# Maximum number of articles to crawl per category
MAX_ARTICLES_PER_CAT = 2 # None (or 0) disables the limit: every URL in the file is crawled

# Output folder: each category is written as one JSON file containing its articles
DATA_FOLDER_OUTPUT = "data/vnexpress"
#!mkdir -p $DATA_FOLDER_OUTPUT

# Switch to the "eager" page load strategy to load pages faster, without waiting for images
chrome_options.page_load_strategy = "eager"

In [None]:
# Load the category -> article-URL mapping written by the previous step
with open(FILE_URL_PATH, mode="r", encoding="utf-8") as url_file:
    url_data = json.loads(url_file.read())

# Number of categories found in the file
len(url_data)

14

In [None]:
def _attr(element, name):
    """Return the element's attribute ``name`` as a string, using "" when it is missing (None)."""
    return element.get_attribute(name) or ""


def _is_aligned(element, direction):
    """Tell whether ``direction`` (e.g. "right", "center") occurs in the element's align or style attribute."""
    return direction in _attr(element, "align") or direction in _attr(element, "style")


def _caption_or_text(text, max_caption_len=100):
    """Wrap ``text`` in [] when it is short enough to count as a caption; return longer text unchanged."""
    return f"[{text}]" if len(text) <= max_caption_len else text


def get_content_metadata(driver, article_url):
    """
    Extracts and returns metadata and content from a given article URL.

    The page is loaded with ``driver`` and the title, description, breadcrumb
    categories, publication date, body text and author are read from it.
    Body text is built from the direct children of ``article.fck_detail``:
    ``p.Normal`` paragraphs (centred ones wrapped in ``[]``), ``figure``
    elements and slide-show ``div`` elements (text of at most 100 characters
    wrapped in ``[]``). Tables and any other tags are ignored.

    :param driver: Selenium WebDriver instance.
    :param article_url: URL of the article to extract data from.
    :return: Dictionary with keys ``url``, ``title``, ``description``,
        ``content`` (paragraphs joined with newlines) and ``metadata``
        (``cat``, ``subcat``, ``published_date``, ``author``).
    :raises NoSuchElementException: if a required element (title, description,
        breadcrumb, publication date or article body) is not on the page.
    """

    # Load the article page
    driver.get(article_url)

    # Collect title
    title = driver.find_element(by=By.CSS_SELECTOR, value="h1.title-detail").text.strip()

    # Collect description
    description = driver.find_element(by=By.CLASS_NAME, value="description").text.strip()

    # Collect categories: first breadcrumb item is the main one, second the sub-category
    lis_cat = driver.find_element(by=By.CSS_SELECTOR, value="ul.breadcrumb").find_elements(by=By.TAG_NAME, value="li")
    main_cat = lis_cat[0].text if len(lis_cat) > 0 else None
    sub_cat = lis_cat[1].text if len(lis_cat) > 1 else None

    # Collect published date ("" when the element carries no content attribute)
    publish_date = _attr(
        driver.find_element(by=By.CSS_SELECTOR, value='[itemprop="datePublished"]'), "content"
    ).strip()

    # Collect the article content
    # Locate the container that holds the written content
    article = driver.find_element(by=By.CSS_SELECTOR, value="article.fck_detail")
    # Direct children of the container, in document order
    children = article.find_elements(by=By.XPATH, value="./*")

    contents = []
    author = "Unknown"

    is_slide_show = False
    # Text of the last child visited; a slide show uses it as the author below
    last_text = ""
    for idx, child in enumerate(children):
        text = child.text.strip()
        last_text = text
        tag = child.tag_name

        # A non-empty, right-aligned paragraph among the last three children is taken as the author.
        # The cheap local checks come first so the attribute look-ups only run when needed.
        if tag == "p" and text and idx >= len(children) - 3 and _is_aligned(child, "right"):
            author = text

        # Regular paragraph; centred text is wrapped in []
        elif tag == "p" and _attr(child, "class") == "Normal":
            if text:
                contents.append(f"[{text}]" if _is_aligned(child, "center") else text)

        # Figure: short text is treated as a caption, longer text as a description
        elif tag == "figure":
            if text:
                contents.append(_caption_or_text(text))

        # Slide-show item: handled like a figure
        elif tag == "div" and "item_slide_show" in _attr(child, "class"):
            is_slide_show = True
            if text:
                contents.append(_caption_or_text(text))

        # Tables and any other tags are skipped for now

    # In a slide show the last child's text is used as the author, but only
    # when it is non-empty so a blank element cannot erase the author.
    if is_slide_show and last_text:
        author = last_text

    # Still no author: fall back to the first element whose class contains "author"
    if author == "Unknown":
        try:
            author = driver.find_element(by=By.XPATH, value="//*[contains(@class, 'author')]").text
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: keep "Unknown" when no such element can be read
            pass

    return {
        "url": article_url,
        "title": title,
        "description": description,
        "content": "\n".join(contents), # join paragraphs with \n
        "metadata": {
            "cat": main_cat,
            "subcat": sub_cat,
            "published_date": publish_date,
            "author": author
        }
    }


In [None]:
os.makedirs(DATA_FOLDER_OUTPUT, exist_ok=True)

# Create exactly one browser session for the whole crawl; rebinding `driver`
# to a second session would leave the first Chrome instance running.
driver = webdriver.Chrome(options=chrome_options)

try:
    for cat, urls in url_data.items():
        print(f"Collect category data {cat} ..")
        count_crawled = 0
        cat_data = []
        for url in urls:
            try:
                cat_data.append(get_content_metadata(driver, url))
                count_crawled += 1
                # A falsy limit (None or 0) means: crawl every URL of the category
                if MAX_ARTICLES_PER_CAT and count_crawled >= MAX_ARTICLES_PER_CAT:
                    break

            except (StaleElementReferenceException, NoSuchElementException):
                # Pages that are not regular articles (e.g. topic pages) lack the
                # expected elements; report the URL and move on to the next one.
                print(f"Bug at url: {url}, with ElementException")
                driver.refresh()
                continue

        # File name processing: lower-case, spaces -> "-", drop other special characters
        name_file_cat = cat.lower().replace(" ", "-")
        name_file_cat = ''.join(c for c in name_file_cat if c.isalnum() or c in ('-', '_')) + ".json"

        # One JSON file per category, written as soon as the category is done
        with open(os.path.join(DATA_FOLDER_OUTPUT, name_file_cat), "w", encoding="utf-8") as fOut:
            json.dump(cat_data, fOut, ensure_ascii=False, indent=4)
finally:
    # Always end the session (windows + chromedriver process), even on an unexpected error
    driver.quit()

Thu thập dữ liệu thể loại Thời sự ..
Thu thập dữ liệu thể loại Thế giới ..
Thu thập dữ liệu thể loại Kinh doanh ..
Bug at url: https://vnexpress.net/topic/con-loc-temu-27788, with ElementException
Thu thập dữ liệu thể loại Bất động sản ..
Thu thập dữ liệu thể loại Khoa học ..
Thu thập dữ liệu thể loại Giải trí ..
Thu thập dữ liệu thể loại Thể thao ..
Thu thập dữ liệu thể loại Pháp luật ..
Thu thập dữ liệu thể loại Giáo dục ..
Thu thập dữ liệu thể loại Sức khỏe ..
Thu thập dữ liệu thể loại Đời sống ..
Thu thập dữ liệu thể loại Du lịch ..
Thu thập dữ liệu thể loại Số hóa ..
Thu thập dữ liệu thể loại Xe ..


In [None]:
# Sample
cat_data[0]

{'url': 'https://vnexpress.net/cach-phanh-xe-em-nhu-tai-xe-limousine-4773457.html',
 'title': 'Cách phanh xe êm như tài xế limousine',
 'description': '"Limo Stop" là kỹ năng phanh xe êm như tài xế limousine, bằng cách nhả nhẹ chân phanh khi xe gần dừng hẳn, giúp người trên xe không chúi về trước.',
 'content': '',
 'metadata': {'cat': 'Xe',
  'subcat': 'Cầm lái',
  'published_date': '2024-07-24T04:00:00+07:00',
  'author': 'Hồ Tân'}}