In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import logging
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def setup_driver():
    """Create a Chrome WebDriver configured for headless scraping.

    Returns:
        A new ``webdriver.Chrome`` instance started with the flags listed
        below. The caller owns the browser session.
    """
    browser_flags = (
        "--headless",  # no visible browser window
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)


In [2]:
import csv
import threading
import os
import time
import random
from selenium.webdriver.common.by import By

output_file = "HDBank-news.csv"
csv_lock = threading.Lock()  # Lock for thread-safe writing

def write_headers():
    """Write the CSV header row to ``output_file`` if it has none yet.

    The header is written when the file does not exist or exists but is
    empty (zero bytes). A file that already has content is left untouched
    so rows from a previous run are preserved.
    """
    # A zero-byte file (e.g. left by an interrupted run) has no header either,
    # so treat it the same as a missing file.
    needs_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    if needs_header:
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=[
                "image", "title", "description", "date", "detail", "detail_images"
            ])
            writer.writeheader()

# Call write_headers once to ensure headers are written if the file doesn't exist
write_headers()


In [3]:
def extract_news(url, index):
    """Load one news detail page in its own browser and collect its content.

    Args:
        url: Address of the detail page to open.
        index: Position of the item in the listing; used only in log and
            error messages.

    Returns:
        A tuple ``(all_images, all_paragraphs)``. ``all_paragraphs`` holds the
        non-empty, stripped text of every ``<p>`` inside the element with
        class ``wrapper-content``; ``all_images`` holds the ``src`` of the
        first ``<img>`` found inside each of those ``<p>`` tags. Both are
        empty lists when the page content cannot be extracted.

    Raises:
        Exception: whatever ``setup_driver()`` or ``get(url)`` raise is
            propagated to the caller.
    """
    logger.info(f"Start Extraction of News detail {index} form Web")
    item = setup_driver()
    try:
        item.get(url)
        time.sleep(15)  # fixed wait for the page to finish rendering
        try:
            wrapper = item.find_element(By.CLASS_NAME, "wrapper-content")

            # NOTE: every worker writes to the same "page.png"; the file only
            # reflects whichever page was captured last.
            item.save_screenshot("page.png")

            # Get all <p> tags inside the wrapper
            paragraphs = wrapper.find_elements(By.TAG_NAME, "p")

            all_paragraphs = []
            all_images = []
            for p in paragraphs:
                # Collect text if exists
                text = p.text.strip()
                if text:
                    all_paragraphs.append(text)

                # Collect the first <img> inside this <p>, if any. Images are
                # best-effort: a failure here must not discard the text.
                try:
                    images = p.find_elements(By.TAG_NAME, "img")
                    if images:
                        all_images.append(images[0].get_attribute("src"))
                except Exception:
                    pass  # Skip this paragraph's image

            return all_images, all_paragraphs

        except Exception as e:
            print(f"[{index + 1}] Failed to extract info: {e}")
            return [], []
    finally:
        # Always close the browser; otherwise one Chrome process per news
        # item stays alive for the lifetime of the kernel.
        item.quit()

In [4]:
def store_data(row, index, href):
    """Add the detail-page content to ``row`` and append it to the CSV file.

    Args:
        row: Dict with the listing fields of one news item. It is mutated in
            place: ``detail`` and ``detail_images`` are added.
        index: Position of the item in the listing; used only in log and
            error messages.
        href: URL of the item's detail page, passed to ``extract_news``.

    Returns:
        The updated ``row`` when it was written to ``output_file``, or
        ``None`` when writing failed.
    """
    logger.info(f"Start Extraction of News {index} form Web")
    try:
        all_images, all_paragraphs = extract_news(href, index)
        row['detail'] = all_paragraphs
        row['detail_images'] = all_images
    except Exception as e:
        print(f"[{index + 1}] Failed to Extract Detail: {e}")
        # Keep the row shape identical to a successful extraction so the
        # written line still lines up with the CSV header.
        row.setdefault('detail', [])
        row.setdefault('detail_images', [])

    try:
        # Fixed column order matching the file header; any unexpected extra
        # keys are appended after the known columns instead of being dropped.
        header = ["image", "title", "description", "date", "detail", "detail_images"]
        fieldnames = header + [key for key in row if key not in header]

        with csv_lock:
            with open(output_file, "a", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
                writer.writerow(row)

        logger.info(f"Extraction Complete of News {index} form Web")
        return row

    except Exception as e:
        print(f"[{index + 1}] Failed to insert info: {e}")
        return None

In [5]:
# Open the news listing and collect every list item that represents a news entry.
driver = setup_driver()
driver.get("https://hdbank.com.vn/news/moi-nhat?page=1000")

time.sleep(15)  # fixed wait for the listing to finish rendering

# Start from an empty list so the cells below still run (with nothing to do)
# when the lookup fails, instead of hitting a NameError on `elements`.
elements = []
try:
    elements = driver.find_elements(By.XPATH, "//li[contains(@class, 'item-news')]")
    element_count = len(elements)
    logger.info(f"Found {element_count} elements to scrape")
    driver.save_screenshot("page.png")
except Exception as e:
    logger.error(f"An error occurred during scraping: {str(e)}", exc_info=True)
element_count = len(elements)
print(f"Found {element_count} elements.")




2025-04-15 10:47:14,796 - INFO - Found 864 elements to scrape


Found 864 elements.


In [None]:
# Read the listing fields of each news item and hand the detail-page scrape
# (plus the CSV write) to a small thread pool.
element_count = len(elements)

# Only items after this index are processed.
# NOTE(review): presumably the earlier items were saved by a previous run — confirm.
RESUME_AFTER_INDEX = 558

# ThreadPoolExecutor rejects max_workers=0, so keep at least one worker even
# when the listing is empty.
with ThreadPoolExecutor(max_workers=max(1, min(4, element_count))) as executor:
    futures_to_indices = {}

    for index, element in enumerate(elements):
        if index <= RESUME_AFTER_INDEX:
            continue
        try:
            # Image (optional)
            try:
                img = element.find_element(By.XPATH, ".//div[contains(@class, 'news-left-box')]//img").get_attribute("src")
            except NoSuchElementException:
                img = ""

            # Title
            title = element.find_element(By.XPATH, ".//div[contains(@class, 'news-right-box')]//p[contains(@class, 'news-title')]").text.strip()

            # Description (optional)
            try:
                description = element.find_element(By.XPATH, ".//p[contains(@class, 'news-description')]").text.strip()
            except NoSuchElementException:
                description = ""

            # Date
            date = element.find_element(By.XPATH, ".//p[contains(@class, 'news-date')]/time").text.strip()

            row = {
                "image": img,
                "title": title,
                "description": description,
                "date": date
            }
            # Take the link from the item being processed, so each row gets
            # the detail page that belongs to it.
            href = element.find_element(By.XPATH, ".//div[contains(@class, 'news-left-box')]/a").get_attribute("href")
            future = executor.submit(store_data, row, index, href)
            futures_to_indices[future] = index
        except Exception as e:
            print(f"[{index + 1}] Failed to extract info: {e}")

    for future in as_completed(futures_to_indices):
        index = futures_to_indices[future]
        try:
            result = future.result()
            if result is None:
                # store_data returns None when the row could not be written.
                logger.warning(f"News at index {index} was not written to the CSV")
            else:
                logger.info(f"Successfully scraped store at index {index}")
        except Exception as e:
            logger.error(f"Error scraping store at index {index}: {e}", exc_info=True)

2025-04-15 10:47:29,763 - INFO - Start Extraction of News 559 form Web
2025-04-15 10:47:29,767 - INFO - Start Extraction of News detail 559 form Web
2025-04-15 10:47:29,919 - INFO - Start Extraction of News 560 form Web
2025-04-15 10:47:29,924 - INFO - Start Extraction of News detail 560 form Web
2025-04-15 10:47:30,112 - INFO - Start Extraction of News 561 form Web
2025-04-15 10:47:30,116 - INFO - Start Extraction of News detail 561 form Web
2025-04-15 10:47:30,329 - INFO - Start Extraction of News 562 form Web
2025-04-15 10:47:30,341 - INFO - Start Extraction of News detail 562 form Web
2025-04-15 10:48:06,597 - INFO - Extraction Complete of News 562 form Web
2025-04-15 10:48:06,601 - INFO - Start Extraction of News 563 form Web
2025-04-15 10:48:06,605 - INFO - Start Extraction of News detail 563 form Web
2025-04-15 10:48:13,709 - INFO - Extraction Complete of News 560 form Web
2025-04-15 10:48:13,712 - INFO - Start Extraction of News 564 form Web
2025-04-15 10:48:13,716 - INFO - Sta

[598] Failed to extract info: Message: no such element: Unable to locate element: {"method":"css selector","selector":".wrapper-content"}
  (Session info: chrome=135.0.7049.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7689A5335+78597]
	GetHandleVerifier [0x00007FF7689A5390+78688]
	(No symbol) [0x00007FF7687591AA]
	(No symbol) [0x00007FF7687AF149]
	(No symbol) [0x00007FF7687AF3FC]
	(No symbol) [0x00007FF768802467]
	(No symbol) [0x00007FF7687D712F]
	(No symbol) [0x00007FF7687FF2BB]
	(No symbol) [0x00007FF7687D6EC3]
	(No symbol) [0x00007FF7687A03F8]
	(No symbol) [0x00007FF7687A1163]
	GetHandleVerifier [0x00007FF768C4EEED+2870973]
	GetHandleVerifier [0x00007FF768C49698+2848360]
	GetHandleVerifier [0x00007FF768C66973+2967875]
	GetHandleVerifier [0x00007FF7689C017A+188746]
	GetHandleVerifier [0x00007FF7689C845F+222255]
	GetHandleVerifier [0x00007FF

2025-04-15 10:57:29,776 - INFO - Extraction Complete of News 596 form Web
2025-04-15 10:57:29,785 - INFO - Start Extraction of News 601 form Web
2025-04-15 10:57:29,786 - INFO - Successfully scraped store at index 596
2025-04-15 10:57:29,791 - INFO - Start Extraction of News detail 601 form Web
2025-04-15 10:57:49,908 - INFO - Extraction Complete of News 598 form Web
2025-04-15 10:57:49,912 - INFO - Start Extraction of News 602 form Web
2025-04-15 10:57:49,912 - INFO - Successfully scraped store at index 598
2025-04-15 10:57:49,914 - INFO - Start Extraction of News detail 602 form Web
2025-04-15 10:57:53,930 - INFO - Extraction Complete of News 599 form Web
2025-04-15 10:57:53,934 - INFO - Start Extraction of News 603 form Web
2025-04-15 10:57:53,939 - INFO - Start Extraction of News detail 603 form Web
2025-04-15 10:57:53,935 - INFO - Successfully scraped store at index 599
2025-04-15 10:58:16,589 - INFO - Extraction Complete of News 600 form Web
2025-04-15 10:58:16,593 - INFO - Start