In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import logging
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def setup_driver():
    """Create a headless Chrome WebDriver using the scraper's standard flags.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when finished.
    """
    opts = Options()
    for flag in (
        "--headless",  # no visible browser window
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-notifications",
    ):
        opts.add_argument(flag)
    # A public proxy can be enabled by adding a flag such as:
    #   '--proxy-server=http://157.230.149.107:1040'

    return webdriver.Chrome(options=opts)


In [2]:
import csv
import threading
import os
import time
import random
from selenium.webdriver.common.by import By

output_file = "HDBank-Promotions.csv"
csv_lock = threading.Lock()  # Lock for thread-safe writing

# Column order of the promotions CSV; the header row is written from this list.
CSV_FIELDNAMES = ["image", "title", "short_description", "date", "detail"]


def write_headers(path=None):
    """Write the CSV header row when the target file is missing or empty.

    A file that already has content is left untouched so earlier results are
    never truncated. An existing zero-byte file is treated like a missing one;
    otherwise it would never receive a header row.

    Args:
        path: File to initialise. Defaults to the module-level ``output_file``.
    """
    target = output_file if path is None else path
    if os.path.exists(target) and os.path.getsize(target) > 0:
        return
    # utf-8-sig writes a BOM at the start of the file.
    with open(target, "w", newline="", encoding="utf-8-sig") as f:
        csv.DictWriter(f, fieldnames=CSV_FIELDNAMES).writeheader()


# Call write_headers once so the header row exists before any data rows are appended
write_headers()


In [3]:
def extract_promotions(url, index):
    """Open a promotion detail page in its own browser and return its text.

    A dedicated WebDriver is created for the call and is always shut down
    before returning, so repeated calls do not accumulate Chrome processes.

    Args:
        url: Address of the promotion detail page.
        index: Zero-based position of the promotion on the listing page; used
            only in log messages.

    Returns:
        The visible text of the element with class ``wrapper-content``, or
        ``None`` if the page could not be loaded or the element was not found.
    """
    logger.info(f"Start Extraction of Promotions detail {index} form Web")
    item = setup_driver()
    try:
        item.get(url)
        time.sleep(10)  # fixed pause before reading the page content
        wrapper = item.find_element(By.CLASS_NAME, "wrapper-content")
        return wrapper.text
    except Exception as e:
        logger.error(f"[{index + 1}] Failed to extract info: {e}")
        return None
    finally:
        # Always release the browser, even when loading or lookup failed.
        item.quit()

In [4]:
# extract_promotions('https://hdbank.com.vn/en/personal/promotion/detail/ngan-hang-dien-tu/hanh-trinh-vinh-quang-40',1)

In [5]:
def store_data(row, index, href):
    """Fetch a promotion's detail text and append the completed row to the CSV.

    Args:
        row: Dict holding the listing fields ("image", "title",
            "short_description", "date"). It is updated in place with a
            "detail" key.
        index: Zero-based position of the promotion; used only in log messages.
        href: URL of the promotion's detail page.

    Returns:
        The updated ``row`` once it has been written, or ``None`` if writing
        to the CSV failed.
    """
    # Fixed column order so every data row lines up with the header row,
    # whether or not the detail text could be fetched.
    fieldnames = ["image", "title", "short_description", "date", "detail"]

    logger.info(f"Start Extraction of News {index} form Web")
    try:
        row['detail'] = extract_promotions(href, index)
    except Exception as e:
        logger.error(f"[{index + 1}] Failed to Extract Detail: {e}")
        row['detail'] = None  # keep the schema complete; csv writes None as ""

    try:
        # Only one thread may append at a time; the file is shared by all workers.
        with csv_lock:
            with open(output_file, "a", newline="", encoding="utf-8-sig") as f:
                writer = csv.DictWriter(
                    f, fieldnames=fieldnames, restval="", extrasaction="ignore"
                )
                writer.writerow(row)

        logger.info(f"Extraction Complete of News {index} form Web")
        return row
    except Exception as e:
        logger.error(f"[{index + 1}] Failed to insert info: {e}")
        return None

In [6]:
# driver = setup_driver()
# driver.get("https://hdbank.com.vn/en/personal/promotion")

# time.sleep(15)
# section = driver.find_element(By.XPATH, "//section[contains(@class, 'happyzone-section_dayline')]")
# try:
#     elements = section.find_elements(By.XPATH, "//div[contains(@class, 'promotion-item')]")
#     element_count = len(elements)
#     logger.info(f"Found {element_count} elements to scrape")
#     # driver.save_screenshot("page.png")
# except Exception as e:
#     logger.error(f"An error occurred during scraping: {str(e)}", exc_info=True)
# element_count = len(elements)
# print(f"Found {element_count} elements.")


In [7]:
# elements[0].find_element(By.XPATH, ".//div[contains(@class, 'promotion-item_content__title')]/a").get_attribute("href")

In [8]:
def _find_child(parent, by, value, failure_message):
    """Return ``parent.find_element(by, value)``, or log and return None if absent."""
    try:
        return parent.find_element(by, value)
    except NoSuchElementException:
        logger.error(failure_message, exc_info=True)
        return None


def _text_of(element, xpath):
    """Return the stripped text of the first XPath match under ``element``, or ''."""
    try:
        return element.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return ""


def scrape_page(driver):
    """Scrape every promotion card on the page currently loaded in ``driver``.

    Listing fields (image, title, short description, date, detail link) are
    read from each card on the calling thread. Fetching the detail page and
    writing the CSV row is delegated to ``store_data`` on a pool of up to four
    worker threads. Cards without a detail link are skipped.

    All errors are logged rather than raised. If the expected page structure
    is missing, or the page has no cards, the function returns without
    scraping anything.

    Args:
        driver: WebDriver positioned on a promotions listing page.
    """
    try:
        section = _find_child(
            driver, By.XPATH,
            "//section[contains(@class, 'happyzone-section_dayline')]",
            "Failed to find the section with class 'happyzone-section_dayline'",
        )
        if section is None:
            return

        container = _find_child(
            section, By.CLASS_NAME, "container",
            "Failed to find 'container' div inside section",
        )
        if container is None:
            return

        grid_row = _find_child(
            container, By.CLASS_NAME, "row",
            "Failed to find 'row' div inside container",
        )
        if grid_row is None:
            return

        # find_elements returns an empty list (it does not raise) when nothing matches.
        elements = grid_row.find_elements(By.XPATH, ".//div[contains(@class, 'col-12') and contains(@class, 'col-md-6') and contains(@class, 'col-lg-4')]")

        element_count = len(elements)
        logger.info(f"Found {element_count} elements to scrape on page")
        if element_count == 0:
            # ThreadPoolExecutor rejects max_workers=0, so stop here.
            return

        with ThreadPoolExecutor(max_workers=min(4, element_count)) as executor:
            futures_to_indices = {}
            for index, element in enumerate(elements):
                try:
                    # Detail link: without it there is nothing for a worker to fetch.
                    try:
                        href = element.find_element(By.XPATH, ".//div[contains(@class, 'promotion-item_content__title')]/a").get_attribute("href")
                    except NoSuchElementException:
                        href = None
                    if not href:
                        logger.warning(f"[{index + 1}] Skipping item: no href found.")
                        continue

                    # Image
                    try:
                        img = element.find_element(By.XPATH, ".//div[contains(@class, 'promotion-item_img')]//img").get_attribute("src")
                    except NoSuchElementException:
                        img = ""

                    promotion = {
                        "image": img,
                        "title": _text_of(element, ".//div[contains(@class, 'promotion-item_content__title')]//p[contains(@class, 'lcl')]"),
                        "short_description": _text_of(element, ".//div[contains(@class, 'promotion-item_content__desc')]"),
                        "date": _text_of(element, ".//div[contains(@class, 'promotion-item_content__time')]/p"),
                    }

                    future = executor.submit(store_data, promotion, index, href)
                    futures_to_indices[future] = index
                except Exception as e:
                    logger.error(f"[{index + 1}] Failed to extract info: {e}")

            for future in as_completed(futures_to_indices):
                index = futures_to_indices[future]
                try:
                    future.result()  # re-raises any exception from the worker
                    logger.info(f"Successfully scraped store at index {index}")
                except Exception as e:
                    logger.error(f"Error scraping store at index {index}: {e}", exc_info=True)

    except Exception as e:
        logger.error(f"An error occurred during scraping: {str(e)}", exc_info=True)


In [9]:
def navigate_to_next_page(driver):
    """Click the pagination 'Next' link and hand the same driver back.

    Waits up to 10 seconds for the link to become clickable, clicks it, then
    pauses 5 seconds. Any failure is logged and swallowed.

    Args:
        driver: WebDriver positioned on a listing page.

    Returns:
        The ``driver`` that was passed in, whether or not navigation succeeded.
    """
    try:
        next_link_locator = (By.XPATH, "//li[@class='next-btn']/a")
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_link_locator)
        ).click()
        time.sleep(5)  # Adjust this time if necessary
        logger.info("Navigated to next page")
    except Exception as e:
        logger.error(f"Error navigating to next page: {e}")
    return driver

In [None]:
def scrape_with_pagination(start_url, skip_pages=7):
    """Walk the paginated promotions listing, scraping pages after a skipped prefix.

    Starting at ``start_url``, the function repeatedly clicks the 'Next' link.
    Pages whose 1-based number is greater than ``skip_pages`` are passed to
    ``scrape_page``; earlier pages are only paged through. The loop ends when
    the 'Next' link is missing, carries a ``disabled`` class, or an unexpected
    error occurs. The browser is always closed on exit.

    Args:
        start_url: URL of the first listing page.
        skip_pages: Number of leading pages to page past without scraping.
            Defaults to 7, the value that was previously hard-coded.
            NOTE(review): pass 0 to scrape every page; confirm whether 7 is
            still the intended default.
    """
    next_xpath = "//li[@class='next-btn']/a"
    driver = setup_driver()
    try:
        driver.get(start_url)
        time.sleep(5)  # Initial load wait
        count = 1
        while True:
            if count > skip_pages:
                scrape_page(driver)  # Still pass current context

            # Try to find and click the "Next" button
            try:
                next_btn = driver.find_element(By.XPATH, next_xpath)
                # get_attribute returns None when the attribute is absent.
                classes = next_btn.get_attribute("class") or ""

                if "disabled" in classes:
                    logger.info("Reached last page. 'Next' button is disabled.")
                    break  # Stop loop when the button is disabled

                # Scroll the element into view
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", next_btn)
                time.sleep(2)  # Wait a bit for the scroll to finish and render

                # Optional: highlight the element (useful when taking screenshots)
                driver.execute_script("arguments[0].style.border='3px solid red'", next_btn)
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, next_xpath))
                )
                next_button.click()
                time.sleep(10)
                logger.info(f"'Next' button found. And Goes TO next Page({count}).")
                count = count + 1
            except NoSuchElementException:
                logger.error("No 'Next' button found. Stopping pagination.")
                break
            except Exception as e:
                logger.error(f"Unexpected error during pagination: {e}", exc_info=True)
                break
    finally:
        # Release the browser even if loading or scraping raised.
        driver.quit()


In [12]:
# Start the scraping process from the first page of the English promotions
# listing; scraped rows are appended to the CSV by the worker threads.
scrape_with_pagination("https://hdbank.com.vn/en/personal/promotion")

2025-04-24 10:22:56,046 - INFO - 'Next' button found. And Goes TO next Page(1).
2025-04-24 10:23:08,553 - INFO - 'Next' button found. And Goes TO next Page(2).
2025-04-24 10:23:21,074 - INFO - 'Next' button found. And Goes TO next Page(3).
2025-04-24 10:23:33,599 - INFO - 'Next' button found. And Goes TO next Page(4).
2025-04-24 10:23:46,138 - INFO - 'Next' button found. And Goes TO next Page(5).
2025-04-24 10:23:58,667 - INFO - 'Next' button found. And Goes TO next Page(6).
2025-04-24 10:24:11,240 - INFO - 'Next' button found. And Goes TO next Page(7).
2025-04-24 10:24:11,390 - INFO - Found 9 elements to scrape on page
2025-04-24 10:24:11,507 - INFO - Start Extraction of News 0 form Web
2025-04-24 10:24:11,510 - INFO - Start Extraction of Promotions detail 0 form Web
2025-04-24 10:24:11,612 - INFO - Start Extraction of News 1 form Web
2025-04-24 10:24:11,615 - INFO - Start Extraction of Promotions detail 1 form Web
2025-04-24 10:24:11,721 - INFO - Start Extraction of News 2 form Web
2

In [None]:
# driver = setup_driver()
# driver.get("https://hdbank.com.vn/en/personal/promotion")
# time.sleep(10)
# try:
#     next_btn = driver.find_element(By.XPATH, "//li[@class='next-btn']/a")
#     classes = next_btn.get_attribute("class")
    
#     if "disabled" in classes:
#         logger.info("Reached last page. 'Next' button is disabled.")
    
#     # Scroll the element into view
#     driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", next_btn)
#     time.sleep(2)  # Wait a bit for the scroll to finish and render

#     # Optional: highlight the element before screenshot (for better visibility)
#     driver.execute_script("arguments[0].style.border='3px solid red'", next_btn)
  
#     next_button = WebDriverWait(driver, 10).until(
#                 EC.element_to_be_clickable((By.XPATH, "//li[@class='next-btn']/a"))
#             )
#     # Proceed to click
#     next_button.click()
#     time.sleep(10)

# except NoSuchElementException:
#     logger.info("No 'Next' button found. Stopping pagination.")
