In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [2]:

# Scraper configuration
OUTPUT_FILE = 'zu_wuqing.csv'  # CSV file the merged results are written to
MAX_PAGES = 20  # upper bound on the number of listing pages to visit
URL = 'https://tj.zu.fang.com/house-a052/'  # first listing page to open

# Start Chrome and load the first listing page
driver = webdriver.Chrome()
driver.get(URL)

# Row-level extraction helper
def extract_house_data(row):
    """Extract the rental listing fields from a single row element.

    Args:
        row: An element exposing ``find_element(by, selector)`` whose
            results have a ``.text`` attribute (a Selenium WebElement).

    Returns:
        A 7-item list ``[title, rent_type, floor, area, direction, locate,
        price]``, or ``None`` when any of the four expected child elements
        is missing from the row. The four middle fields are the first four
        "|"-separated parts of the details line, stripped of whitespace;
        parts that are absent are reported as ``None``.
    """
    try:
        title = row.find_element(By.CSS_SELECTOR, "p.title").text
        summary = row.find_element(By.CSS_SELECTOR, "p.font15.mt12.bold").text
        location = row.find_element(
            By.XPATH,
            ".//p[contains(@class, 'gray6') and contains(@class, 'mt12')]",
        ).text
        price = row.find_element(By.CSS_SELECTOR, "p.mt5.alingC").text
    except NoSuchElementException:
        # An incomplete row is reported as None so the caller can skip it.
        return None

    # Split the details line and pad it to four entries so that missing
    # trailing fields come out as None. str.split always yields at least
    # one part, so the first field is always a (possibly empty) string.
    parts = [piece.strip() for piece in summary.split("|")]
    parts.extend([None] * (4 - len(parts)))
    rent_type, floor, area, direction = parts[:4]

    return [title, rent_type, floor, area, direction, location, price]

# Main crawl: visit up to MAX_PAGES listing pages, collect one DataFrame per
# page, then merge everything into a single CSV file.
#
# The whole run is wrapped in try/finally so the browser is always closed,
# even if merging/saving fails or the run is interrupted.
string_list = []  # holds one DataFrame per scraped page (name kept as-is)
try:
    for i in range(MAX_PAGES):
        try:
            # Wait until the listing container is present in the DOM.
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'houseList'))
            )
            rows = table.find_elements(By.TAG_NAME, 'dl')

            # extract_house_data returns None for incomplete rows; drop those.
            data = [
                house_data
                for house_data in map(extract_house_data, rows)
                if house_data
            ]

            # Store the current page as its own DataFrame.
            df = pd.DataFrame(data, columns=['标题', '租赁类型', '户型', '面积', '朝向', '位置', '价格'])
            string_list.append(df)
            print(f"已爬取第 {i + 1} 页数据")

            # The page limit is reached: do not load a page that would
            # never be read.
            if i == MAX_PAGES - 1:
                break

            # Go to the next page; a timeout here means there is no
            # clickable "next page" link, i.e. this was the last page.
            next_page = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//div[@id="rentid_D10_01"]/a[text()="下一页"]'))
            )
            next_page.click()
            time.sleep(2)  # give the next page time to load
        except TimeoutException:
            print("已到达最后一页或页面加载超时")
            break
        except Exception as e:
            # Best-effort crawl: report the problem and keep what was
            # collected so far instead of losing it.
            print(f"发生异常: {e}")
            break

    # Merge all pages and save them as a CSV file.
    if string_list:
        final_df = pd.concat(string_list, ignore_index=True)
        # utf_8_sig writes a BOM so spreadsheet tools detect the encoding.
        final_df.to_csv(OUTPUT_FILE, index=False, encoding='utf_8_sig')
        print(f"数据已保存为 {OUTPUT_FILE}")
    else:
        print("未爬取到任何数据")
finally:
    # Always close the browser, whatever happened above.
    driver.quit()

已爬取第 1 页数据
已爬取第 2 页数据
已爬取第 3 页数据
已到达最后一页或页面加载超时
数据已保存为 zu_wuqing.csv
