In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import tqdm
import json
import os
import re

In [None]:
# Top-level section slug -> list of sub-section slugs to crawl.
# Both parts are interpolated into the listing URL
# (https://vnexpress.net/<section>/<sub-section>-p<page>), so every string
# here must match the site's URL path segments exactly.
type_dict = {
    "thoi-su": [
        "chinh-tri", "dan-sinh", "lao-dong-viec-lam", "giao-thong", "mekong",
        "quy-hy-vong",
    ],
    "goc-nhin": [
        "binh-luan-nhieu", "chinh-tri-chinh-sach", "y-te-suc-khoe",
        "kinh-doanh-quan-tri", "giao-duc-tri-thuc", "moi-truong",
        "van-hoa-doi-song", "covid-19", "tac-gia",
    ],
    "the-gioi": [
        "tu-lieu", "phan-tich", "nguoi-viet-5-chau", "cuoc-song-do-day",
        "quan-su",
    ],
    "kinh-doanh": [
        "net-zero", "quoc-te", "doanh-nghiep", "chung-khoan", "ebank",
        "vi-mo", "tien-cua-toi", "hang-hoa",
    ],
    "bat-dong-san": ["chinh-sach", "thi-truong", "khong-gian-song", "tu-van"],
    "khoa-hoc": [
        "khoa-hoc-trong-nuoc", "pii-doi-moi-sang-tao", "tin-tuc", "phat-minh",
        "ung-dung", "the-gioi-tu-nhien", "thuong-thuc",
    ],
    "giai-tri": [
        "gioi-sao", "sach", "video", "phim", "nhac", "thoi-trang", "lam-dep",
        "san-khau-my-thuat",
    ],
    "the-thao": [
        "bong-da", "du-lieu-bong-da", "marathon", "tennis", "cac-mon-khac",
        "hau-truong",
    ],
    "phap-luat": ["ho-so-pha-an", "tu-van"],
    "giao-duc": [
        "tin-tuc", "tuyen-sinh", "chan-dung", "du-hoc", "thao-luan",
        "hoc-tieng-anh", "giao-duc-40",
    ],
    "suc-khoe": ["tin-tuc", "cac-benh", "song-khoe", "vaccine"],
    "doi-song": ["nhip-song", "to-am", "bai-hoc-song", "cooking", "tieu-dung"],
    "du-lich": ["diem-den", "am-thuc", "dau-chan", "tu-van", "cam-nang"],
    "so-hoa": ["cong-nghe", "san-pham", "blockchain"],
    "xe": ["thi-truong", "dien-dan"],
    "y-kien": ["thoi-su", "doi-song"],
}

In [None]:
# The crawl cell writes to data/urls/surl_list.jsonl, so the nested "urls"
# directory must exist too; makedirs creates "data" along the way.
os.makedirs("data/urls", exist_ok=True)

In [None]:
# Vietnamese weekday names and the English names that
# datetime.strptime's "%A" directive accepts under an English locale.
vietnamese_days = {
    "Chủ nhật": "Sunday",
    "Thứ hai": "Monday",
    "Thứ ba": "Tuesday",
    "Thứ tư": "Wednesday",
    "Thứ năm": "Thursday",
    "Thứ sáu": "Friday",
    "Thứ bảy": "Saturday",
}


def convert_vietnamese_date(date_str):
    """Swap the Vietnamese weekday for its English name and strip the GMT tag.

    Args:
        date_str: Date text that may contain one of the weekday names in
            ``vietnamese_days`` and a ``" (GMT+7)"``-style marker.

    Returns:
        The string with every occurrence of the first matching weekday
        (checked in ``vietnamese_days`` order) replaced by its English name,
        and every ``<whitespace>(GMT<sign><1-2 digits>)`` marker removed.
        Text without a known weekday or GMT marker is returned unchanged.
    """
    # Only the first weekday name found is translated; later names in the
    # mapping are not considered once one has matched.
    matched_day = next((day for day in vietnamese_days if day in date_str), None)
    if matched_day is not None:
        date_str = date_str.replace(matched_day, vietnamese_days[matched_day])

    gmt_marker = re.compile(r"\s\(GMT[+-]\d{1,2}\)")
    return gmt_marker.sub("", date_str)

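## Crawl article listings and article pages

For every section / sub-section pair in `type_dict`, walk the paginated listing pages at `https://vnexpress.net/<section>/<sub-section>-p<page>`, open each listed article, and keep up to 33 articles per sub-section that were published in 2020 or later and contain fewer than 1000 words. Each kept article (URL, title, description, date, section label, word count, and body text) is written as a JSON record to `data/urls/surl_list.jsonl`.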

In [None]:
crawl_url_list = []

# Crawl settings, gathered in one place so they are easy to tune.
OUTPUT_PATH = "data/urls/surl_list.jsonl"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
MIN_YEAR = 2020       # keep articles published in this year or later
MAX_WORDS = 1000      # keep articles with strictly fewer words than this
target_count = 33     # number of articles to collect per sub-category

total_count = 0
numlost = 0           # listing entries without a link since the last saved article
max_numlost = 3       # give up on the current category after this many


def _build_sample(article_url, article_title, description, label):
    """Download one article page and build its output record.

    Args:
        article_url: URL taken from the listing entry's first anchor.
        article_title: ``title`` attribute of that anchor.
        description: The ``<p class="description">`` tag found after the entry.
        label: ``"<category>|<sub_type>"`` string stored under ``"type"``.

    Returns:
        A dict with keys ``url``, ``title``, ``description``, ``date``,
        ``type``, ``words`` and ``content``, or ``None`` when the page has no
        date, is older than ``MIN_YEAR``, or its word count is not in the
        open interval ``(0, MAX_WORDS)``.

    Raises:
        requests.RequestException: the article page could not be fetched.
        ValueError: the date text does not match ``"%A, %d/%m/%Y, %H:%M"``.
    """
    page = requests.get(article_url, timeout=REQUEST_TIMEOUT)
    page_soup = BeautifulSoup(page.content, "html.parser")

    date_tag = page_soup.find("span", class_="date")
    if date_tag is None:
        return None
    article_date = date_tag.text.strip()
    if not article_date:
        return None

    article_date = convert_vietnamese_date(article_date)
    published = datetime.strptime(article_date, "%A, %d/%m/%Y, %H:%M")
    if published.year < MIN_YEAR:
        return None

    # Body text is the concatenation of all <p class="Normal"> paragraphs.
    article_content = "".join(
        p_tag.get_text(" ", strip=True) + " "
        for p_tag in page_soup.find_all("p", class_="Normal")
    )
    word_num = len(article_content.split())
    if not 0 < word_num < MAX_WORDS:
        return None

    return {
        "url": article_url,
        "title": article_title,
        "description": description.get_text(strip=True),
        "date": article_date,
        "type": label,
        "words": word_num,
        "content": article_content,
    }


# open() does not create missing parent directories.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# The context manager closes the file even if the crawl is interrupted.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f_urls:
    # The loop variable is named `category`, not `type`, so the builtin
    # `type` stays usable in every cell executed after this one.
    for category in tqdm.tqdm(type_dict, desc="Processing types"):
        for sub_type in tqdm.tqdm(type_dict[category], desc=f"Processing subtypes for {category}"):
            label = f"{category}|{sub_type}"
            page_no = 1
            current_count = 0
            # Reset per sub-category; otherwise one failure streak would make
            # every later category stop after its first listing page.
            numlost = 0

            while current_count < target_count:
                url = f"https://vnexpress.net/{category}/{sub_type}-p{page_no}"
                try:
                    response = requests.get(url, timeout=REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    # A network failure ends this sub-category, not the whole crawl.
                    print(f"Error: {e}")
                    break

                if response.status_code != 200:
                    print(f"{response.status_code} Error {label}")
                    break

                soup = BeautifulSoup(response.content, "html.parser")
                title_tags = soup.find_all(class_="title-news")
                if not title_tags:
                    print(f"No article {label}.")
                    break

                for title in title_tags:
                    try:
                        anchors = title.find_all("a")
                        if not anchors:
                            numlost += 1
                            print(f"Error {label}. Continue...")
                            if numlost >= max_numlost:
                                print(f"Error {label}. Exiting...")
                                break
                            continue

                        article_url = anchors[0].get("href")
                        article_title = anchors[0].get("title")
                        if not (article_url and article_title):
                            continue

                        description = title.find_next("p", class_="description")
                        if not description:
                            continue

                        sample = _build_sample(article_url, article_title, description, label)
                        if sample is None:
                            continue

                        # JSON Lines needs exactly one record per line, so the
                        # record is serialized without indentation.
                        f_urls.write(json.dumps(sample, ensure_ascii=False) + "\n")
                        current_count += 1
                        total_count += 1
                        numlost = 0
                        print(f"Total: {total_count}")
                        if current_count >= target_count:
                            break

                    except Exception as e:
                        # Best effort: a single bad article must not stop the crawl.
                        print(f"Error: {e}")

                if numlost >= max_numlost:
                    print(f"Error {label}. Exiting....")
                    break
                page_no += 1

            # Too many broken entries: skip the rest of this category.
            if numlost >= max_numlost:
                print(f"Error {label}. Exiting.....")
                break
