In [40]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import logging
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def setup_driver():
    """Build a headless Chrome WebDriver and hand it back to the caller.

    Returns:
        The ``webdriver.Chrome`` instance created with the flags listed below.
    """
    browser_flags = (
        "--headless",  # no visible browser window
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)
    # A proxy can be enabled by adding a '--proxy-server=<url>' flag above.

    # Start Chrome with the collected options.
    return webdriver.Chrome(options=opts)


In [41]:
import csv
import threading
import os
import time
import random
from selenium.webdriver.common.by import By

# CSV file that receives the scraped FAQ rows: write_headers() creates it and
# store_data() appends to it.
output_file = "HDBank-FAQ.csv"
# store_data() holds this lock while appending; it is submitted to a thread
# pool by scrape_page(), so several writes can be in flight at once.
csv_lock = threading.Lock()  # Lock for thread-safe writing

def write_headers():
    """Create the output CSV with its header row when it is missing or empty.

    Reads the module-level ``output_file`` path. A file that already has
    content is left untouched, so rows written by earlier runs are preserved.
    """
    # A zero-byte file (for example one left behind by an interrupted run) has
    # no header row either, so it is treated the same as a missing file.
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        return

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=["Questions", "Answer"])
        writer.writeheader()

# Runs when this cell executes: make sure the CSV exists with its header row
# before any scraping starts, because store_data() only ever appends rows.
write_headers()


In [42]:
def store_data(row, index):
    """Append one FAQ row to the output CSV.

    Args:
        row: Mapping of column name to value. Its key order determines the
            column order of the written line.
        index: Position of the FAQ on the page; used only in log messages.

    Returns:
        ``row`` when the line was written, ``None`` when writing failed.
        Failures are logged, never raised.
    """
    logger.info(f"Start Extraction of Faqs {index} form Web")

    try:
        # scrape_page() submits this function to a thread pool, so appends are
        # serialised with the shared lock to keep lines from interleaving.
        with csv_lock:
            with open(output_file, "a", newline="", encoding="utf-8-sig") as f:
                writer = csv.DictWriter(f, fieldnames=row.keys())
                writer.writerow(row)
    except Exception as e:
        # Best-effort write: report through the logger (with traceback) like
        # the rest of the file instead of printing, and signal failure to the
        # caller with an explicit None.
        logger.error(f"[{index + 1}] Failed to insert info: {e}", exc_info=True)
        return None

    logger.info(f"Extraction Complete of Faqs {index} form Web")
    return row

In [43]:
def _extract_faq_row(element):
    """Read the question and answer text out of one FAQ list item."""
    # Question
    try:
        question = element.find_element(
            By.XPATH, ".//span[contains(@class, 'title-question')]"
        ).text
    except NoSuchElementException:
        question = "Not Found"

    # Answer: take the inner HTML and reduce it to text, one line per text node.
    try:
        html = element.find_element(
            By.XPATH, ".//div[contains(@class, 'content-answer-box')]"
        ).get_attribute("innerHTML")
        answer = BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
    except NoSuchElementException:
        answer = 'Not Found'

    return {"Questions": question, "Answer": answer}


def scrape_page(driver):
    """Scrape every FAQ entry on the currently loaded page into the CSV.

    Questions and answers are read from the page on the calling thread; only
    the CSV write (``store_data``) is handed to a small thread pool. All
    failures are logged and none are raised.

    Args:
        driver: WebDriver whose current page contains the FAQ list.

    Returns:
        None.
    """
    try:
        try:
            section = driver.find_element(By.XPATH, "//ul[contains(@class, 'list-questions')]")
        except NoSuchElementException:
            logger.error("Failed to find the section with class 'UL list-questions'", exc_info=True)
            # Nothing to scrape; continuing would only hit an unbound `section`.
            return

        # find_elements returns an empty list when nothing matches.
        elements = section.find_elements(By.XPATH, ".//li[contains(@class, 'item-question')]")

        element_count = len(elements)
        logger.info(f"Found {element_count} elements to scrape on page")
        if element_count == 0:
            # ThreadPoolExecutor rejects max_workers=0, so stop here.
            return

        with ThreadPoolExecutor(max_workers=min(4, element_count)) as executor:
            futures_to_indices = {}
            for index, element in enumerate(elements):
                try:
                    row = _extract_faq_row(element)
                    future = executor.submit(store_data, row, index)
                    futures_to_indices[future] = index
                except Exception as e:
                    logger.error(f"[{index + 1}] Failed to extract info: {e}", exc_info=True)

            for future in as_completed(futures_to_indices):
                index = futures_to_indices[future]
                try:
                    # store_data returns None when the write failed.
                    if future.result() is None:
                        logger.error(f"Failed to store FAQ at index {index}")
                    else:
                        logger.info(f"Successfully scraped store at index {index}")
                except Exception as e:
                    logger.error(f"Error scraping store at index {index}: {e}", exc_info=True)

    except Exception as e:
        logger.error(f"An error occurred during scraping: {str(e)}", exc_info=True)


In [44]:
def navigate_to_next_page(driver):
    """Click the pagination 'Next' link and pause while the page loads.

    Any failure is logged rather than raised.

    Args:
        driver: WebDriver whose current page holds the pagination control.

    Returns:
        The same ``driver`` object, whether or not the click succeeded.
    """
    try:
        # Allow up to 10 seconds for the link to become clickable.
        waiter = WebDriverWait(driver, 10)
        locator = (By.XPATH, "//li[@class='next-btn']/a")
        link = waiter.until(EC.element_to_be_clickable(locator))
        link.click()
        time.sleep(5)  # fixed pause after the click; tune if pages load slower
        logger.info("Navigated to next page")
    except Exception as err:
        logger.error(f"Error navigating to next page: {err}")
    return driver

In [45]:
def scrape_with_pagination(start_url, max_pages=None):
    """Scrape every FAQ page reachable from ``start_url`` via the 'Next' link.

    Each page is handed to ``scrape_page``. Afterwards the pagination link is
    clicked, until one of these stop conditions is met: the link is missing,
    it is marked disabled, an unexpected error occurs, or ``max_pages`` pages
    have been scraped. The browser is always closed, even when loading the
    first page raises.

    Args:
        start_url: Address of the first FAQ page.
        max_pages: Optional upper bound on the number of pages to scrape.
            ``None`` (the default) keeps following 'Next' until another stop
            condition is met. Values below 1 behave like 1.

    Raises:
        Whatever ``driver.get`` raises for the initial page load; pagination
        errors are logged and end the loop instead.
    """
    driver = setup_driver()
    try:
        driver.get(start_url)
        time.sleep(5)  # Initial load wait
        count = 1  # number of the page currently being scraped
        while True:
            scrape_page(driver)  # Still pass current context

            if max_pages is not None and count >= max_pages:
                logger.info(f"Reached max_pages={max_pages}. Stopping pagination.")
                break

            # Try to find and click the "Next" button
            try:
                next_btn = driver.find_element(By.XPATH, "//li[@class='next-btn']/a")
                # get_attribute returns None when the attribute is absent;
                # normalise so the substring test below cannot raise.
                classes = next_btn.get_attribute("class") or ""

                if "disabled" in classes:
                    logger.info("Reached last page. 'Next' button is disabled.")
                    break  # Stop loop when the button is disabled

                # Scroll the element into view
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", next_btn)
                time.sleep(2)  # Wait a bit for the scroll to finish and render

                # Debugging aid: outline the link so it stands out in screenshots.
                driver.execute_script("arguments[0].style.border='3px solid red'", next_btn)
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//li[@class='next-btn']/a"))
                )
                # driver.save_screenshot(f"page{count}.png")
                # Proceed to click
                next_button.click()
                time.sleep(10)  # fixed pause for the next page to load
                logger.info(f"'Next' button found. And Goes TO next Page({count}).")
                count += 1
            except NoSuchElementException:
                logger.error("No 'Next' button found. Stopping pagination.")
                break
            except Exception as e:
                logger.error(f"Unexpected error during pagination: {e}", exc_info=True)
                break
    finally:
        # Always release the browser process, including on load failures.
        driver.quit()


In [47]:
# Start the scraping process for the corporate Q&A pages; rows are appended to
# the CSV named by output_file.
# To scrape the personal Q&A pages instead, use the call below:
# scrape_with_pagination("https://hdbank.com.vn/en/personal/QnA")
scrape_with_pagination("https://hdbank.com.vn/en/corporate/QnA")

2025-04-24 14:28:11,705 - INFO - Found 4 elements to scrape on page
2025-04-24 14:28:11,773 - INFO - Start Extraction of Faqs 0 form Web
2025-04-24 14:28:11,779 - INFO - Extraction Complete of Faqs 0 form Web
2025-04-24 14:28:11,825 - INFO - Start Extraction of Faqs 1 form Web
2025-04-24 14:28:11,827 - INFO - Extraction Complete of Faqs 1 form Web
2025-04-24 14:28:11,879 - INFO - Start Extraction of Faqs 2 form Web
2025-04-24 14:28:11,881 - INFO - Extraction Complete of Faqs 2 form Web
2025-04-24 14:28:11,926 - INFO - Successfully scraped store at index 0
2025-04-24 14:28:11,926 - INFO - Start Extraction of Faqs 3 form Web
2025-04-24 14:28:11,927 - INFO - Successfully scraped store at index 2
2025-04-24 14:28:11,929 - INFO - Successfully scraped store at index 1
2025-04-24 14:28:11,931 - INFO - Extraction Complete of Faqs 3 form Web
2025-04-24 14:28:11,932 - INFO - Successfully scraped store at index 3
2025-04-24 14:28:24,182 - INFO - 'Next' button found. And Goes TO next Page(1).
2025