In [27]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time

def crawl_page_count(driver, original_url):
    try:
        # Find the element containing the page count
        page_count_element = driver.find_element(By.XPATH, "(//*[@class= 'typography-subtitle1 typography-primary-color typography-sm- typography-md- typography-lg-'])[3]")
        page_count_text = page_count_element.text
        page_count = int(page_count_text) if page_count_text.isdigit() else 0
    except NoSuchElementException:
        # If no page count is found, return 1
        page_count = 1
    
    return page_count

# Initialize the browser
driver = webdriver.Chrome()

file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping\Links.txt"

with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Create DataFrame to store data
df_links = pd.DataFrame(columns=['Original Link', 'Page Count'])

for original_url in arr_urls:
    # Access the original page
    driver.get(original_url)
    
    # Get the page count for the current link
    page_count = crawl_page_count(driver, original_url)
    
    # Append the page count to DataFrame
    df_temp = pd.DataFrame({'Original Link': original_url, 'Page Count': page_count}, index=[0])
    df_links = pd.concat([df_links, df_temp], ignore_index=True)

# Save DataFrame to CSV file
df_links.to_csv('page_counts.csv', index=False)

# Print DataFrame to console for inspection
print(df_links)

# Close the browser when done
driver.quit()


                                        Original Link Page Count
0   https://www.classic.com/m/porsche/911/964/carr...          2
1   https://www.classic.com/m/porsche/911/993/carr...         14
2    https://www.classic.com/m/porsche/911/993/turbo/         16
3   https://www.classic.com/m/porsche/911/g-body/c...         20
4   https://www.classic.com/m/porsche/911/g-body/9...         26
5   https://www.classic.com/m/mercedes-benz/s/w126...         11
6   https://www.classic.com/m/mercedes-benz/sl/r10...         76
7    https://www.classic.com/m/mercedes-benz/sl/w121/         24
8   https://www.classic.com/m/mercedes-benz/g/w463...          7
9   https://www.classic.com/m/bmw/3-series/e30/m3/...         14
10       https://www.classic.com/m/bmw/z3/roadster/m/         21
11  https://www.classic.com/m/bmw/3-series/e36/m3/...         23
12  https://www.classic.com/m/lancia/delta/1st-gen...          4
13       https://www.classic.com/m/ferrari/458/coupe/          8
14  https://www.classic.c

In [28]:
def generate_page_links(original_url, page_count):
    # Initialize an empty list to store the generated page links
    page_links = []
    
    # Generate links for each page based on the page count
    for page_number in range(1, page_count + 1):
        page_url = f"{original_url}?page={page_number}"
        page_links.append(page_url)
    
    return page_links

# Read the page counts DataFrame from the CSV file
page_count = pd.read_csv('page_counts.csv')
df1 = page_count.copy()

# Create an empty DataFrame to store generated page links
df_page_links = pd.DataFrame(columns=['Original Link', 'Page Link'])

# Iterate through each link and generate page links
for index, row in df1.iterrows():
    original_url = row['Original Link']
    df1 = row['Page Count']
    
    # Generate page links for the current link
    page_links = generate_page_links(original_url, df1)
    
    # Append page links to DataFrame
    for page_link in page_links:
        df_temp = pd.DataFrame({'Original Link': original_url, 'Page Link': page_link}, index=[0])
        df_page_links = pd.concat([df_page_links, df_temp], ignore_index=True)

# Save generated page links to a text file
with open('page_links.txt', 'w') as file:
    for page_link in df_page_links['Page Link']:
        file.write(page_link + '\n')

# Print generated page links DataFrame for inspection
print(df_page_links)


                                         Original Link  \
0    https://www.classic.com/m/porsche/911/964/carr...   
1    https://www.classic.com/m/porsche/911/964/carr...   
2    https://www.classic.com/m/porsche/911/993/carr...   
3    https://www.classic.com/m/porsche/911/993/carr...   
4    https://www.classic.com/m/porsche/911/993/carr...   
..                                                 ...   
360             https://www.classic.com/m/nissan/gt-r/   
361             https://www.classic.com/m/nissan/gt-r/   
362             https://www.classic.com/m/nissan/gt-r/   
363             https://www.classic.com/m/nissan/gt-r/   
364             https://www.classic.com/m/nissan/gt-r/   

                                             Page Link  
0    https://www.classic.com/m/porsche/911/964/carr...  
1    https://www.classic.com/m/porsche/911/964/carr...  
2    https://www.classic.com/m/porsche/911/993/carr...  
3    https://www.classic.com/m/porsche/911/993/carr...  
4    https://www.c

In [41]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime

def crawl_link_per_page(driver, original_url, df_links, found_links_file):
    # Lấy số lượng trang
    page_count_element = driver.find_element(By.XPATH, "(//*[@class= 'typography-subtitle1 typography-primary-color typography-sm- typography-md- typography-lg-'])[1]")
    page_count_text = page_count_element.text
    page_count = int(page_count_text) if page_count_text.isdigit() else 0

    # Lấy thời gian bắt đầu tạo
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Lấy các phần tử
    df_links_temp = pd.DataFrame(columns=['Original Link', 'Page Count', 'Name', 'Found Link', 'Status', 'Date', 'Time'])
    elements = driver.find_elements(By.XPATH, "//*[@class= 'text-xl leading-5 font-medium table:text-secondary table:text-base flex-1']")
    for element in elements:
        try:
            found_link = element.get_attribute("href")
            name = element.text
            
            # Lưu vào tệp và DataFrame
            with open(found_links_file, 'a') as file:
                file.write(found_link + '\n')
            df_temp = pd.DataFrame({'Original Link': original_url, 'Page Count': page_count, 'Name': name, 'Found Link': found_link, 'Status': '', 'Date': current_time.split()[0], 'Time': current_time.split()[1]}, index=[0])
            df_links_temp = pd.concat([df_links_temp, df_temp], ignore_index=True)
        except Exception as e:
            print(f"Error occurred while processing element: {e}")
        
    # Thêm vào DataFrame chính
    df_links = pd.concat([df_links, df_links_temp], ignore_index=True)
    
    return df_links

def collect_found_links(driver, df_links, found_links_file):
    # Thu thập các liên kết đã tìm thấy và nối vào DataFrame df_links
    with open(found_links_file, 'r') as file:
        found_urls = file.read().splitlines()

    for found_url in found_urls:
        # Lấy thời gian bắt đầu collect
        start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        driver.get(found_url)
        time.sleep(3)  # Thêm một đợi nhỏ để ổn định, bạn có thể điều chỉnh nó
        try:
            # Find the element using the first XPATH expression
            view_listing_element = driver.find_element(By.XPATH, "//*[@class= 'flex md:inline-flex items-center justify-center px-5 py-2 uppercase font-medium tracking-wider whitespace-nowrap rounded transition duration-200 text-blue-500 border border-blue-500 hover:bg-blue-50 w-full h-full']")
            status_element = driver.find_element(By.XPATH, "//*[@class= 'border font-medium uppercase inline-block whitespace-nowrap text-white bg-black border-black px-2 text-lg rounded']")
            status = status_element.text
            df_links.loc[df_links['Found Link'] == found_url, 'Status'] = status

            # If the first XPATH expression doesn't find the element, try the second one
            if not view_listing_element:
                view_listing_element = driver.find_element(By.XPATH, "//*[@class= 'flex md:inline-flex items-center justify-center uppercase font-medium tracking-wider whitespace-nowrap rounded transition duration-200 text-white bg-blue-500 hover:bg-blue-500/90 border border-blue-500 hover:border-blue-500/90 shadow-lg w-full h-full px-5 py-1']")

            # If the element is found, get its href attribute
            view_listing_href = view_listing_element.get_attribute('href')
            status_element = driver.find_element(By.XPATH, "//*[@class= 'border font-medium uppercase inline-block whitespace-nowrap text-red-600 border-red-600 px-2 text-lg rounded']")
            status = status_element.text
            df_links.loc[df_links['Found Link'] == found_url, 'Status'] = status
            # Update the DataFrame with the link if found
            df_links.loc[df_links['Found Link'] == found_url, 'Link View Listing'] = view_listing_href
            # Thêm thời gian bắt đầu collect
            df_links.loc[df_links['Found Link'] == found_url, 'Start Time'] = start_time
        except NoSuchElementException as e:
            print(f'Không tìm thấy liên kết xem danh sách cho {found_url}: {e}')

    
    return df_links

# Khởi tạo trình duyệt
driver = webdriver.Chrome()

file_path = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping\page_links copy.txt"

with open(file_path, 'r') as file:
    arr_urls = file.read().splitlines()

# Tạo DataFrame để lưu trữ dữ liệu
df_links = pd.DataFrame(columns=['Original Link', 'Page Count', 'Name', 'Found Link', 'Status', 'Date', 'Time', 'Start Time'])

found_links_file = r"D:\GitHub\Work-Experiences\Tuan Loc Commodities\2. Scrapping\found links.txt"

for original_url in arr_urls:
    # Truy cập trang gốc
    driver.get(original_url)
    # Lấy dữ liệu từ trang
    df_links = crawl_link_per_page(driver, original_url, df_links, found_links_file) 
    time.sleep(5)

# Thu thập các liên kết đã tìm thấy từ arr_urls và nối vào DataFrame df_links
df_links = collect_found_links(driver, df_links, found_links_file)

# Lưu DataFrame vào tệp CSV
df_links.to_csv('df_links.csv', index=False)

# In DataFrame ra console để kiểm tra
print(df_links)

# Đóng trình duyệt khi hoàn tất
driver.quit()


Không tìm thấy liên kết xem danh sách cho https://www.classic.com/veh/1991-porsche-964-carrera-coupe-wp0ab2968ms411173-4o39yB4/: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@class= 'border font-medium uppercase inline-block whitespace-nowrap text-white bg-black border-black px-2 text-lg rounded']"}
  (Session info: chrome=124.0.6367.61); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6A6C71522+60802]
	(No symbol) [0x00007FF6A6BEAC22]
	(No symbol) [0x00007FF6A6AA7CE4]
	(No symbol) [0x00007FF6A6AF6D4D]
	(No symbol) [0x00007FF6A6AF6E1C]
	(No symbol) [0x00007FF6A6B3CE37]
	(No symbol) [0x00007FF6A6B1ABBF]
	(No symbol) [0x00007FF6A6B3A224]
	(No symbol) [0x00007FF6A6B1A923]
	(No symbol) [0x00007FF6A6AE8FEC]
	(No symbol) [0x00007FF6A6AE9C21]
	GetHandleVerifier [0x00007FF6A6F7413D+3217821]
	GetHandleVerifier [0x00007

PermissionError: [Errno 13] Permission denied: 'df_links.csv'