In [25]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

def main():
    INPUT_FILENAME = 'Sample data.csv'
    OUTPUT_FILENAME = 'scraped_companies_output.csv'
    
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--log-level=3')
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(service=service, options=options)
    
    urls_df = pd.read_csv(INPUT_FILENAME)
    all_scraped_data = []

    for url in urls_df['URL']:
        driver.get(url)
        time.sleep(5)
        
        if not driver.find_elements(By.CSS_SELECTOR, "h2.noselect"):
            continue

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        company_data = {}
        
        h2_tag = soup.find('h2', class_='noselect')
        if h2_tag:
            company_data['company_name_th'] = h2_tag.text.strip()
        
        h3_tag = soup.find('h3', class_='noselect')
        if h3_tag:
            company_data['company_name_en'] = h3_tag.text.strip()
        
        data_map = {
            'เลขทะเบียน': 'registration_number', 'ประกอบธุรกิจ': 'business_type',
            'สถานะ': 'status', 'วันที่จดทะเบียน': 'registration_date',
            'ทุนจดทะเบียน': 'registered_capital'
        }
        
        detail_panel = soup.find('td', style=lambda value: value and 'width:420px' in value)
        if detail_panel:
            for table in detail_panel.find_all('table'):
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True).replace(':', '')
                        if key in data_map:
                            company_data[data_map[key]] = cells[1].get_text(separator=' ', strip=True)
                        if key == 'ที่ตั้ง':
                            address_tag = cells[1].find('a', class_='noselect')
                            company_data['address'] = address_tag.get_text(strip=True) if address_tag else cells[1].get_text(strip=True)
        
        if company_data:
            all_scraped_data.append(company_data)

    driver.quit()

    if all_scraped_data:
        df = pd.DataFrame(all_scraped_data)
        final_columns = ['company_name_th', 'company_name_en', 'registration_number', 'business_type', 'status', 'registration_date', 'registered_capital', 'address']
        df = df.reindex(columns=final_columns)
        df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')

if __name__ == '__main__':
    main()

KeyboardInterrupt: 

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

def perform_login(driver, username, password):
    """Handles the process of entering credentials and clicking the login button."""
    driver.find_element(By.ID, 'login-username').send_keys(username)
    driver.find_element(By.ID, 'login-password').send_keys(password)
    driver.find_element(By.ID, 'btn-login').click()
    time.sleep(5)

def main():
    LOGIN_URL = 'https://www.dataforthai.com/login'
    USERNAME = ''
    PASSWORD = ''
    INPUT_FILENAME = 'Sample data.csv'
    OUTPUT_FILENAME = 'scraped_companies_output.csv'
    
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--log-level=3')
    options.add_argument("start-maximized")
    driver = webdriver.Chrome(service=service, options=options)
    
    driver.get(LOGIN_URL)
    time.sleep(3)
    perform_login(driver, USERNAME, PASSWORD)

    urls_df = pd.read_csv(INPUT_FILENAME)
    all_scraped_data = []

    for url in urls_df['URL']:
        driver.get(url)
        time.sleep(5)
        relogin_prompt = driver.find_elements(By.CSS_SELECTOR, "div.panel-danger")
        if relogin_prompt:
            driver.find_element(By.CSS_SELECTOR, "a.btn-success[href='/login']").click()
            time.sleep(3)
            perform_login(driver, USERNAME, PASSWORD)
            driver.get(url)
            time.sleep(5)
            
        if not driver.find_elements(By.CSS_SELECTOR, "h2.noselect"):
            continue

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        company_data = {}
        
        h2_tag = soup.find('h2', class_='noselect')
        if h2_tag:
            company_data['company_name_th'] = h2_tag.text.strip()
        
        h3_tag = soup.find('h3', class_='noselect')
        if h3_tag:
            company_data['company_name_en'] = h3_tag.text.strip()
        
        data_map = {
            'เลขทะเบียน': 'registration_number', 'ประกอบธุรกิจ': 'business_type',
            'สถานะ': 'status', 'วันที่จดทะเบียน': 'registration_date',
            'ทุนจดทะเบียน': 'registered_capital'
        }
        
        detail_panel = soup.find('td', style=lambda value: value and 'width:420px' in value)
        if detail_panel:
            for table in detail_panel.find_all('table'):
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True).replace(':', '')
                        if key in data_map:
                            company_data[data_map[key]] = cells[1].get_text(separator=' ', strip=True)
                        if key == 'ที่ตั้ง':
                            address_tag = cells[1].find('a', class_='noselect')
                            company_data['address'] = address_tag.get_text(strip=True) if address_tag else cells[1].get_text(strip=True)
        
        if company_data:
            all_scraped_data.append(company_data)

    driver.quit()

    if all_scraped_data:
        df = pd.DataFrame(all_scraped_data)
        final_columns = ['company_name_th', 'company_name_en', 'registration_number', 'business_type', 'status', 'registration_date', 'registered_capital', 'address']
        df = df.reindex(columns=final_columns)
        df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')

if __name__ == '__main__':
    main()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"a.btn-success[href='/login']"}
  (Session info: chrome=138.0.7204.101); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
	GetHandleVerifier [0x0x711a33+62339]
	GetHandleVerifier [0x0x711a74+62404]
	(No symbol) [0x0x552123]
	(No symbol) [0x0x59a86e]
	(No symbol) [0x0x59ac0b]
	(No symbol) [0x0x5e2f72]
	(No symbol) [0x0x5bf404]
	(No symbol) [0x0x5e07a3]
	(No symbol) [0x0x5bf1b6]
	(No symbol) [0x0x58e7a2]
	(No symbol) [0x0x58f644]
	GetHandleVerifier [0x0x9865c3+2637587]
	GetHandleVerifier [0x0x9819ca+2618138]
	GetHandleVerifier [0x0x7384aa+220666]
	GetHandleVerifier [0x0x7288d8+156200]
	GetHandleVerifier [0x0x72f06d+182717]
	GetHandleVerifier [0x0x719978+94920]
	GetHandleVerifier [0x0x719b02+95314]
	GetHandleVerifier [0x0x704c4a+9626]
	BaseThreadInitThunk [0x0x77a65d49+25]
	RtlInitializeExceptionChain [0x0x77e6d1ab+107]
	RtlGetAppContainerNamedObjectPath [0x0x77e6d131+561]
