In [1]:
import json
import requests
from bs4 import BeautifulSoup
import os
import time
from random import choice
from typing import List, Optional
import pandas as pd 
from concurrent.futures import ThreadPoolExecutor

In [2]:
def clean_text(text):
    return text.strip() if text else ""

def remove_link_tag(links):
    return [link for link in links if link]

In [3]:
def get_type_of_law(url):
    text = url.split("/")[4]
    return text.split("?")[0]

In [4]:
PROJECT_ROOT = "/kaggle/working/"
PROXY_LIST_PATH = os.path.join(PROJECT_ROOT, 'config', 'proxy_list.txt')

def create_proxy_list():
    PROXY_URL = 'https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all'
    proxy_list = requests.get(PROXY_URL)
    os.makedirs(os.path.dirname(PROXY_LIST_PATH), exist_ok=True)
    with open(PROXY_LIST_PATH, 'w') as f:
        f.write(proxy_list.text.replace('\r\n', '\n'))

def choice_proxy() -> Optional[str]:
    if not os.path.exists(PROXY_LIST_PATH):
        create_proxy_list()
    
    with open(PROXY_LIST_PATH, 'r') as f:
        proxies = f.read().split('\n')
        return choice(proxies)

In [5]:
def load_url(url: str, return_content: bool = False) -> Optional[str]:
    """Load URL content with error handling."""
    proxy = choice_proxy()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, proxies={'http': proxy}, timeout=10)
        response.raise_for_status()
        if not return_content:
            return response
        else:
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup
    except Exception as e:
        # Ghi log lỗi ra file hoặc danh sách
        with open("failed_links.txt", "a") as log_file:
            log_file.write(f"{url}\t{str(e)}\n")
        print(f"Failed to load {url}: {e}")
        return None

In [6]:
def get_all_sub_qa_url(url: str) -> List[str]:
    """Get all sub-URLs from the main QA URL."""
    soup = load_url(url, return_content=True)
    # Extract article links
    articles = soup.select('article')
    keywords = []
    time = []
    date = []
    links = []
    type_qa = []

    type_url = get_type_of_law(url)
    for article in articles:
        tag_a = article.find('a', class_='title-link')
        links.append("https://thuvienphapluat.vn" + tag_a.get("href") if tag_a else None)
        type_qa.append(type_url)
        # Keyword
        keyword_find = article.select('.d-block.sub-item-head-keyword')
        if keyword_find:
            keywords.append([kw.get_text(strip=True) for kw in keyword_find])
        else:
            keywords.append([None])
            
         # Get sub-time
        sub_time_tag = article.find('span', class_='sub-time')
        if sub_time_tag:
            time.append(sub_time_tag.get_text(strip=True).replace(" ","").split("|")[0])
            date.append(sub_time_tag.get_text(strip=True).replace(" ","").split("|")[1])
        else:
            time.append(None)
            date.append(None)
        
    df = pd.DataFrame(list(zip(links, keywords, date, time, type_qa)), columns =['link', 'keyword', "date", "time", "type"])
    return df

In [7]:
base_url = [ 
    "https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page={}",
    "https://thuvienphapluat.vn/phap-luat/doanh-nghiep?page={}",
    "https://thuvienphapluat.vn/phap-luat/lao-dong-tien-luong?page={}",
    "https://thuvienphapluat.vn/phap-luat/bat-dong-san?page={}",
    "https://thuvienphapluat.vn/phap-luat/vi-pham-hanh-chinh?page={}",
    "https://thuvienphapluat.vn/phap-luat/bao-hiem?page={}",
    "https://thuvienphapluat.vn/phap-luat/quyen-dan-su?page={}",
    "https://thuvienphapluat.vn/phap-luat/van-hoa-xa-hoi?page={}",
    "https://thuvienphapluat.vn/phap-luat/thuong-mai?page={}",
    "https://thuvienphapluat.vn/phap-luat/trach-nhiem-hinh-su?page={}",
    "https://thuvienphapluat.vn/phap-luat/xay-dung-do-thi?page={}",
    "https://thuvienphapluat.vn/phap-luat/chung-khoan?page={}",
    "https://thuvienphapluat.vn/phap-luat/ke-toan-kiem-toan?page={}",
    "https://thuvienphapluat.vn/phap-luat/thue-phi-le-phi?page={}",
    "https://thuvienphapluat.vn/phap-luat/xuat-nhap-khau?page={}",
    "https://thuvienphapluat.vn/phap-luat/tien-te-ngan-hang?page={}",
    "https://thuvienphapluat.vn/phap-luat/dau-tu?page={}",
    "https://thuvienphapluat.vn/phap-luat/so-huu-tri-tue?page={}",
    "https://thuvienphapluat.vn/phap-luat/thu-tuc-to-tung?page={}",
    "https://thuvienphapluat.vn/phap-luat/tai-chinh-nha-nuoc?page={}",
    "https://thuvienphapluat.vn/phap-luat/the-thao-y-te?page={}",
    "https://thuvienphapluat.vn/phap-luat/dich-vu-phap-ly?page={}",
    "https://thuvienphapluat.vn/phap-luat/tai-nguyen-moi-truong?page={}",
    "https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page={}",
    "https://thuvienphapluat.vn/phap-luat/giao-duc?page={}",
    "https://thuvienphapluat.vn/phap-luat/giao-thong-van-tai?page={}",
    "https://thuvienphapluat.vn/phap-luat/hanh-chinh?page={}",
    "https://thuvienphapluat.vn/phap-luat/linh-vuc-khac?page={}"
    ]

num_page = 500
num_page_urls = [url.format(i) for url in base_url for i in range(1, num_page)]

In [8]:
def process_url(url: str):
    try:
        df = get_all_sub_qa_url(url)
        return df
    except Exception as e:
        with open("failed_links.txt", "a") as log_file:
            log_file.write(f"{url}\t{str(e)}\n")
        print(f"Error processing {url}: {e}")
        return None

In [9]:
%%time 
def process_urls_multithreaded(urls, max_workers=10):
    dfs = []
    failed_urls = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(process_url, urls)
        for url, result in zip(urls, results):
            if result is not None:
                dfs.append(result)
            else:
                failed_urls.append(url)
    
    if failed_urls:
        with open("failed_links_summary.txt", "w") as f:
            for url in failed_urls:
                f.write(url + "\n")
    
    if dfs:
        final_df = pd.concat(dfs, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()  

df = process_urls_multithreaded(num_page_urls, max_workers=1024)

Failed to load https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=304: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error processing https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=304: 'NoneType' object has no attribute 'select'
Failed to load https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=332: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to load https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=341: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to load https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=330: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error processing https://thuvienphapluat.vn/phap-luat/cong-nghe-thong-tin?page=332: 'NoneType' object has no attribute 'select'Error processing https://thuvienphapluat.vn/p

In [10]:
df

Unnamed: 0,link,keyword,date,time,type
0,https://thuvienphapluat.vn/phap-luat/khong-gia...,[Bảo vệ trẻ em],11/01/2025,02:40,cong-nghe-thong-tin
1,https://thuvienphapluat.vn/phap-luat/mua-bao-h...,[Bảo hiểm xe máy],10/01/2025,14:04,cong-nghe-thong-tin
2,,[None],,,cong-nghe-thong-tin
3,https://thuvienphapluat.vn/phap-luat/ho-tro-ph...,[Dịch vụ viễn thông],10/01/2025,12:25,cong-nghe-thong-tin
4,https://thuvienphapluat.vn/phap-luat/ho-tro-ph...,[Giấy phép cung cấp dịch vụ trò chơi điện tử G1],10/01/2025,11:32,cong-nghe-thong-tin
...,...,...,...,...,...
191222,https://thuvienphapluat.vn/phap-luat/co-so-san...,"[Trồng trọt, Phân bón]",06/03/2022,17:42,linh-vuc-khac
191223,,[None],,,linh-vuc-khac
191224,https://thuvienphapluat.vn/phap-luat/giong-cay...,"[Trồng trọt, Giống cây trồng]",06/03/2022,17:39,linh-vuc-khac
191225,https://thuvienphapluat.vn/phap-luat/trang-thi...,"[Trồng trọt, Canh tác]",06/03/2022,17:26,linh-vuc-khac


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191227 entries, 0 to 191226
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   link     173011 non-null  object
 1   keyword  191227 non-null  object
 2   date     173011 non-null  object
 3   time     173011 non-null  object
 4   type     191227 non-null  object
dtypes: object(5)
memory usage: 7.3+ MB


In [12]:
data = df.dropna().reset_index()

In [13]:
data.to_csv("data.csv")