In [1]:
import json
import os
import random
import time
import uuid
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [6]:
# NOTE: the headless Chrome driver is created exactly once, in the SETUP
# section further down in this cell. Creating another webdriver.Chrome(...)
# here would be overwritten by that later assignment without ever being
# quit, leaving an orphaned browser process behind on every run.

def download_image(img_url, save_folder):
    """Download one image into ``save_folder`` under a random file name.

    Args:
        img_url: URL of the image to fetch.
        save_folder: Directory the file is written into (must already exist).

    Returns:
        The generated file name (not the full path) on success, or ``None``
        when the server does not answer with HTTP 200 or any error occurs.
        Errors are printed instead of raised so a single bad image does not
        abort the caller.
    """
    file_path = None
    try:
        # The context manager releases the streamed connection on every path,
        # including the early return for non-200 answers.
        with requests.get(img_url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                return None

            # Read the extension from the URL *path* only, so dots in the host
            # name or in the query string are never mistaken for an extension
            # and no "/" can end up inside the file name.
            ext = os.path.splitext(urlparse(img_url).path)[1].lstrip('.')
            if not ext.isalnum() or len(ext) > 4:
                ext = "jpg"

            # Random name to avoid collisions between images.
            filename = f"{uuid.uuid4()}.{ext}"
            file_path = os.path.join(save_folder, filename)

            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return filename  # the caller stores this name in the article JSON
    except Exception as e:
        print(f"Lỗi tải ảnh {img_url}: {e}")
        # Do not leave a truncated image behind when the transfer failed midway.
        if file_path is not None and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError:
                # Best-effort cleanup; the download is reported as failed anyway.
                pass
    return None

# --- SETUP ---
# Output layout: one JSON file per article in root_dir, images in root_dir/images.
root_dir = './sict_corpus/tintuc'
images_dir = os.path.join(root_dir, 'images')  # separate folder for downloaded images
os.makedirs(root_dir, exist_ok=True)
os.makedirs(images_dir, exist_ok=True)

n_pages = 50   # number of listing pages to walk through
news_id = 0    # running counter used for article ids and file names

# Links to the articles on a listing page.
news_lst_xpath = '//section[@class="irs-blog-field left-img irs-blog-single-field"]/div[1]/div[1]/div[1]//h2/a'
# Main content column of a single article page.
main_content_xpath = '//section[@class="irs-blog-field irs-blog-single-field"]//div[@class="col-md-8"]'


def _first_text(parent, xpath):
    """Return the stripped text of the first element matching ``xpath`` below
    ``parent``, or an empty string when nothing matches.

    ``find_elements`` returns an empty list for "not found", so no exception
    has to be swallowed to treat the field as optional.
    """
    matches = parent.find_elements(By.XPATH, xpath)
    return matches[0].text.strip() if matches else ""


# Initialize the headless Chrome browser.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=chrome_options)

try:
    for page_idx in tqdm(range(1, n_pages + 1)):
        main_url = f'https://sict.haui.edu.vn/vn/tin-tuc/{page_idx}'
        driver.get(main_url)

        # Collect the article links as plain strings up front, so navigating
        # away from the listing page cannot invalidate them.
        news_tags = driver.find_elements(By.XPATH, news_lst_xpath)
        news_page_urls = [tag.get_attribute('href') for tag in news_tags]

        for news_page_url in news_page_urls:
            try:
                driver.get(news_page_url)
                time.sleep(1)  # wait for the page to load

                # 1. Locate the main content area; skip pages that lack it.
                content_tags = driver.find_elements(By.XPATH, main_content_xpath)
                if not content_tags:
                    continue
                main_content_tag = content_tags[0]

                # 2. Title and 3. abstract (both optional -> "" when absent).
                title = _first_text(main_content_tag, './/p[@class="pTitle"]')
                abstract = _first_text(main_content_tag, './/p[@class="pHead"]')

                # 4. Body text and images.
                content_text = ""
                images_data = []  # one metadata dict per downloaded image

                try:
                    paragraphs_tags = main_content_tag.find_elements(
                        By.XPATH, './/p[@class="pBody"]'
                    )
                    # The last pBody paragraph is deliberately left out, as in
                    # the original crawl (presumably a signature/credit line --
                    # TODO confirm against the page markup).
                    text_parts = []
                    for p_tag in paragraphs_tags[:-1]:
                        text = p_tag.text.strip()
                        if text:
                            text_parts.append(text)
                    content_text = '\n'.join(text_parts)

                    # Query the images ONCE per article. Doing this lookup on
                    # main_content_tag inside the paragraph loop re-downloaded
                    # every image once per paragraph. Duplicated sources are
                    # fetched only once.
                    seen_srcs = set()
                    for img in main_content_tag.find_elements(By.TAG_NAME, "img"):
                        src = img.get_attribute('src')
                        if not src or src in seen_srcs:
                            continue
                        seen_srcs.add(src)
                        saved_filename = download_image(src, images_dir)
                        if saved_filename:
                            images_data.append({
                                "original_url": src,
                                "local_filename": saved_filename,
                                "relative_path": f"images/{saved_filename}"
                            })
                except Exception as e:
                    # Keep whatever was gathered and still save the article.
                    print(f"Lỗi xử lý nội dung: {e}")

                # Article metadata.
                article_data = {
                    "id": f"sict_{news_id:05d}",
                    "url": news_page_url,
                    "title": title,
                    "abstract": abstract,
                    "content": content_text,
                    "images": images_data
                }

                # Save one JSON file per article.
                news_filename = f"new_sict_{news_id:05d}.json"
                news_savepath = os.path.join(root_dir, news_filename)

                with open(news_savepath, 'w', encoding='utf-8') as f:
                    json.dump(article_data, f, ensure_ascii=False, indent=4)

                news_id += 1
                # No driver.back(): the next iteration navigates with
                # driver.get(), so going back would only reload the listing page.

            except Exception as e:
                # `Exception` (not a bare except) so Ctrl-C still stops the crawl.
                print(f"Lỗi chung tại url {news_page_url}: {e}")
finally:
    # Always shut the browser down, including on KeyboardInterrupt.
    driver.quit()

  2%|▏         | 1/50 [04:10<3:24:23, 250.27s/it]


KeyboardInterrupt: 

In [14]:
# Number of article links collected from the listing page processed most
# recently (news_page_urls is reassigned for every page by the crawl loop).
len(news_page_urls)

8