In [12]:
import csv
from bs4 import BeautifulSoup
import requests

# Function to extract data from a single block
def extract_data_from_block(block):
    data = {}
    date = block.find(class_="typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600 mb-1")
    data["Date"] = date.get_text(strip=True) if date else ""
    third_party = block.find(class_="typography-body2 text-gray-500 underline cursor-pointer ml-1 text_blue")
    data["Third Party"] = third_party.get_text(strip=True) if third_party else ""
    price = block.find(class_="font-normal text-red-600 ml-1")
    data["Price"] = price.get_text(strip=True) if price else ""
    mileage_location = block.find_all(class_="typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600")
    data["Mileage"] = mileage_location[0].get_text(strip=True) if mileage_location else ""
    data["Location"] = mileage_location[1].get_text(strip=True) if len(mileage_location) > 1 else ""
    data["Event"] = "Sold" if "Purchase reported" in block.text else "Not Sold"
    return data

# Function to extract data from multiple blocks
def extract_data_from_blocks(arr_urls):
    all_data = []
    for url in arr_urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # First find the main container if it exists
        main_container = soup.find("div", class_="pt-10 md:pt-0")
        if not main_container:
            continue

        blocks = main_container.find_all(class_="flex flex-col border-l-[1px] border-black pl-8 py-4 relative")
        
        for block in blocks:
            data = extract_data_from_block(block)
            data["URL"] = url
            all_data.append(data)
    return all_data

# File path
file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping Classic\2.1 My Own Scrapping Code\2.1.2 Scraping Vehicle History\found links.txt"
output_file_path = r"output.csv"  # Tên tệp CSV đầu ra

# Read URLs from the text file
with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Extract data from multiple blocks
all_data = extract_data_from_blocks(arr_urls)

# Write collected data to CSV file
with open(output_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["URL", "Date", "Third Party", "Price", "Mileage", "Location", "Event"])
    writer.writeheader()
    writer.writerows(all_data)

print("Data has been successfully written to", output_file_path)


Data has been successfully written to output.csv


In [15]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Initialize the WebDriver
driver = webdriver.Chrome()

# Read the URLs from a text file
file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping Classic\2.1 My Own Scrapping Code\2.1.2 Scraping Vehicle History\found links.txt"

with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Prepare to collect data
all_data = []

# Loop through each URL
for url in arr_urls:
    driver.get(url)
    time.sleep(5)  # Ensure the page has loaded

    # First find the main container if it exists
    try:
        main_container = driver.find_element(By.XPATH, "//div[contains(@class, 'pt-10 md:pt-0')]")
        blocks = main_container.find_elements(By.XPATH, ".//div[@class='flex flex-col border-l-[1px] border-black pl-8 py-4 relative']")
    except Exception:
        continue  # If the main container isn't found, skip to the next URL

    for block in blocks:
        # Initialize data dictionary
        data = {'URL': url}

        # Extract Date
        try:
            date_element = block.find_element(By.XPATH, ".//span[contains(@class, 'typography-body2')]")
            date = date_element.text.strip()
            data['Date'] = date
        except Exception:
            pass

        # Extract Third Party
        third_party = ''
        try:
            third_party_element = block.find_element(By.XPATH, ".//a[contains(@class, 'underline')]")
            third_party = third_party_element.text
        except Exception:
            pass
        data['Third Party'] = third_party

        # Extract Price
        price = ''
        try:
            price_element = block.find_element(By.XPATH, ".//span[contains(@class, 'font-normal text-black')]")
            price = price_element.text
        except Exception:
            pass
        data['Price'] = price

        # Extract Mileage and Location
        mileage, location = '', ''  # Default values
        try:
            location_element = driver.find_element(By.XPATH, "(//*[@class='typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600'])[1]")
            location = location_element.text
            
            mileage_element = driver.find_element(By.XPATH, "(//*[@class='typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600'])[2]")
            mileage = mileage_element.text
        except Exception:
            pass
        data['Mileage'] = mileage
        data['Location'] = location

        # Extract Event (e.g., Sold, Purchase reported)
        event = ''
        try:
            event_element = block.find_element(By.XPATH, ".//span[contains(@class, 'typography-body2') and contains(text(), 'Sold at')]")
            event = event_element.text
        except Exception:
            try:
                event_element = block.find_element(By.XPATH, ".//span[contains(@class, 'typography-body2') and contains(text(), 'Purchase reported')]")
                event = event_element.text
            except Exception:
                pass
        data['Event'] = event

        # Append data if not already included
        data_dict = {'URL': url, 'Date': date, 'Third Party': third_party, 'Price': price, 'Mileage': mileage, 'Location': location}
        if data_dict not in all_data:
            all_data.append(data_dict)

# Create DataFrame and remove duplicates
df = pd.DataFrame(all_data)
# Drop rows with NaN values in all columns except 'URL'
df = df.dropna(subset=df.columns.difference(['URL']), how='all')
df = df.drop_duplicates(subset=['URL', 'Date', 'Third Party', 'Price', 'Mileage', 'Location'])

# Save the DataFrame to a CSV file
df.to_csv('output_data.csv', index=False)

# Clean up
driver.quit()


In [25]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Initialize the WebDriver
driver = webdriver.Chrome()

# Read the URLs from a text file
file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping Classic\2.1 My Own Scrapping Code\2.1.2 Scraping Vehicle History\found links.txt"

with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Prepare to collect data
all_data = []

# Loop through each URL
for url in arr_urls:
    driver.get(url)
    time.sleep(5)  # Ensure the page has loaded

    # First find the main container if it exists
    try:
        main_container = driver.find_element(By.XPATH, "//div[contains(@class, 'pt-10 md:pt-0')]")
        blocks = main_container.find_elements(By.XPATH, ".//div[@class='flex flex-col border-l-[1px] border-black pl-8 py-4 relative']")
    except Exception:
        continue  # If the main container isn't found, skip to the next URL

    for block in blocks:
        # Initialize data dictionary
        data = {'URL': url}

        # Extract Date
        try:
            date_element = block.find_element(By.XPATH, ".//span[contains(@class, 'typography-body2')]")
            date = date_element.text.strip()
            data['Date'] = date
        except Exception:
            pass

        # Extract Third Party
        third_party = ''
        try:
            third_party_element = block.find_element(By.XPATH, ".//a[contains(@class, 'underline')]")
            third_party = third_party_element.text
        except Exception:
            pass
        data['Third Party'] = third_party

        # Extract Price
        price = ''
        try:
            price_element = block.find_element(By.XPATH, ".//span[contains(@class, 'font-normal text-black')]")
            price = price_element.text
        except Exception:
            pass
        data['Price'] = price

        # Extract Mileage and Location
        mileage, location = '', ''  # Default values
        try:
            location_element = driver.find_element(By.XPATH, "(//*[@class='typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600'])[1]")
            location = location_element.text
            
            mileage_element = driver.find_element(By.XPATH, "(//*[@class='typography-body2 typography-primary-color typography-sm- typography-md- typography-lg- text-gray-600'])[2]")
            mileage = mileage_element.text
        except Exception:
            pass
        data['Mileage'] = mileage
        data['Location'] = location

        # Extract Event (e.g., Sold, Purchase reported)
        event_text = ''
        try:
            event_elements = block.find_elements(By.XPATH, ".//span[contains(@class, 'typography')]")
            for element in event_elements:
                event_text += element.text.strip() + " "
        except Exception:
            pass

        # Append data if not already included
        data_dict = {'URL': url, 'Date': date, 'Third Party': third_party, 'Price': price, 'Mileage': mileage, 'Location': location, 'Event': event_text.strip()}
        if data_dict not in all_data:
            all_data.append(data_dict)

# Create DataFrame and remove duplicates
df = pd.DataFrame(all_data)
# Drop rows with NaN values in all columns except 'URL'
df = df.dropna(subset=df.columns.difference(['URL']), how='all')
df = df.drop_duplicates(subset=['URL', 'Date', 'Third Party', 'Price', 'Mileage', 'Location'])

# Save the DataFrame to a CSV file
df.to_csv('output_data1.csv', index=False)

# Clean up
driver.quit()


In [24]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize the WebDriver
driver = webdriver.Chrome()

# Read the URLs from a text file
file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping Classic\2.1 My Own Scrapping Code\2.1.2 Scraping Vehicle History\found links.txt"

with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Prepare to collect data
all_data = []

# Loop through each URL
for url in arr_urls:
    driver.get(url)
    time.sleep(5)  # Ensure the page has loaded

    # Find and click on the HISTORY tab
    try:
        history_tab = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'tab-nav-trigger') and @data-tab='history']"))
    )
        history_tab.click()
        print("Clicked on the HISTORY tab successfully.")
    except Exception as e:
        print(f"Failed to click on the HISTORY tab: {str(e)}")

    # Find the main container within the HISTORY tab
    try:
        main_container = driver.find_element(By.ID, "vehicle-tabs")
        blocks = main_container.find_elements(By.XPATH, ".//div[contains(@class, 'block')]")
    except Exception as e:
        print(f"Failed to locate the main container for URL: {url}")
        print(e)
        continue  # If the main container isn't found, skip to the next URL

    for block in blocks:
        data = {'URL': url}

        try:
            # Assuming structure for Date, Third Party, Price, Mileage, Location is known and consistent
            date = block.find_element(By.XPATH, "./div[@class='date']").text
            third_party = block.find_element(By.XPATH, "./div[@class='third-party']").text
            price = block.find_element(By.XPATH, "./div[@class='price']").text
            mileage = block.find_element(By.XPATH, "./div[@class='mileage']").text
            location = block.find_element(By.XPATH, "./div[@class='location']").text
            event_text = block.find_element(By.XPATH, "./div[@class='event']").text

            data.update({'Date': date, 'Third Party': third_party, 'Price': price, 'Mileage': mileage, 'Location': location, 'Event': event_text})
        except Exception as e:
            print(f"Failed to extract data for a block in URL: {url}")
            print(e)
            continue  # If any information is missing, skip this block

        # Append data if not already included
        if data not in all_data:
            all_data.append(data)

# Create DataFrame and remove duplicates
df = pd.DataFrame(all_data)
df = df.drop_duplicates()

# Save the DataFrame to a CSV file
df.to_csv('output_datafinak.csv', index=False)

# Clean up
driver.quit()


Failed to click on the HISTORY tab: Message: element click intercepted: Element <li class="tab-nav-trigger p-2 cursor-pointer tab-nav-3" data-tab-index="3" data-tab="history" phx-click="[[&quot;add_class&quot;,{&quot;time&quot;:200,&quot;names&quot;:[&quot;hidden&quot;],&quot;to&quot;:&quot;#vehicle-tabs .tab-item&quot;,&quot;transition&quot;:[[],[],[]]}],[&quot;remove_class&quot;,{&quot;time&quot;:200,&quot;names&quot;:[&quot;hidden&quot;],&quot;to&quot;:&quot;#vehicle-tabs .tab-item[data-tab='history']&quot;,&quot;transition&quot;:[[],[],[]]}],[&quot;remove_class&quot;,{&quot;time&quot;:200,&quot;names&quot;:[&quot;is-active&quot;,&quot;border-b-4&quot;,&quot;border-blue-500&quot;,&quot;text-blue-500&quot;],&quot;to&quot;:&quot;#vehicle-tabs .tab-nav-trigger&quot;,&quot;transition&quot;:[[],[],[]]}],[&quot;add_class&quot;,{&quot;time&quot;:200,&quot;names&quot;:[&quot;is-active&quot;,&quot;border-b-4&quot;,&quot;border-blue-500&quot;,&quot;text-blue-500&quot;],&quot;to&quot;:&quot;#v