In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time
import re

In [11]:
# Listing page to crawl (esf.fang.com search results)
url = "https://esf.fang.com/house-a0987-b05499/"

# Accumulator: scrape_houses() appends one dict per listing here
all_houses = []

# Chrome configuration
options = Options()
# options.add_argument("--headless")  # uncomment to run without a visible window
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")

# Start the browser session used by scrape_houses()
driver = webdriver.Chrome(options=options)

# 爬取函数
def scrape_houses(max_pages=None):
    """Crawl the result pages starting at ``url`` and collect every listing.

    Works on the module-level ``driver``, ``url`` and ``all_houses``: the page
    is opened with ``driver``, and one dict per listing is appended to
    ``all_houses``. After each page the "下一页" link is clicked; the crawl
    stops when that link cannot be found/clicked or when ``max_pages`` pages
    have been scraped.

    A failure while parsing a single listing is printed and that listing is
    skipped. Any other exception is printed together with its traceback.
    The browser is always closed on exit, so ``driver`` has to be created
    again before this function can be called a second time.

    Args:
        max_pages: Optional upper bound on the number of pages to scrape.
            ``None`` (the default) follows the pagination until no next-page
            link is left. The first page is always scraped.

    Returns:
        None. Results are accumulated in ``all_houses``.
    """
    try:
        driver.get(url)
        print("成功打开页面")

        # Wait (up to 10 s) for the results container before reading anything;
        # a timeout is reported by the outer handler below.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.shop_list"))
        )

        page_num = 1
        while True:
            print(f"正在爬取第 {page_num} 页")

            # Fixed pause to give the page time to finish loading
            time.sleep(2)

            for house in driver.find_elements(By.CSS_SELECTOR, "dl.clearfix"):
                try:
                    all_houses.append(_parse_sale_listing(house))
                except Exception as e:
                    # One unparsable listing must not abort the whole page
                    print(f"提取房源信息时出错: {e}")

            if max_pages is not None and page_num >= max_pages:
                print(f"已达到设定的最大页数 {max_pages}，停止爬取")
                break

            # Try the exact link text first, then fall back to a partial match.
            try:
                driver.find_element(By.LINK_TEXT, "下一页").click()
            except NoSuchElementException:
                try:
                    driver.find_element(By.PARTIAL_LINK_TEXT, "下一页").click()
                except Exception:
                    # `Exception` rather than a bare except, so that a kernel
                    # interrupt is not mistaken for "last page reached".
                    print("没有下一页或已达到最后一页")
                    break

            page_num += 1
            # Fixed pause so the next page can load before it is read
            time.sleep(3)

    except Exception as e:
        print(f"爬取过程中出错: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the browser, even after an error
        driver.quit()


def _parse_sale_listing(house):
    """Build the record dict for one listing element of the sale results page.

    Args:
        house: The Selenium element of a single listing (a ``dl.clearfix``).

    Returns:
        dict with the keys 标题, 户型, 面积, 楼层, 朝向, 建筑年份, 地址, 总价, 单价.

    Raises:
        Whatever Selenium raises when a required sub-element is missing;
        the caller catches it and skips the listing.
    """
    title = house.find_element(By.CSS_SELECTOR, "h4.clearfix a").get_attribute("title")

    # The detail line is "|"-separated; pad with "" so missing trailing
    # fields become empty strings instead of raising.
    tel_shop = house.find_element(By.CSS_SELECTOR, "p.tel_shop").text
    details = [part.strip() for part in tel_shop.split("|")]
    details += [""] * (5 - len(details))
    huxing, area, floor, direction, build_year = details[:5]

    address = house.find_element(By.CSS_SELECTOR, "p.add_shop").text.strip()

    price_right = house.find_element(By.CSS_SELECTOR, "dd.price_right")
    total_price = price_right.find_element(By.CSS_SELECTOR, "span.red").text

    # The unit price is taken from the second <span> of the price column;
    # fall back to "未知" when it is absent or cannot be read.
    try:
        price_spans = price_right.find_elements(By.CSS_SELECTOR, "span")
        unit_price = price_spans[1].text if len(price_spans) > 1 else "未知"
    except Exception:
        unit_price = "未知"

    return {
        "标题": title,
        "户型": huxing,
        "面积": area,
        "楼层": floor,
        "朝向": direction,
        "建筑年份": build_year,
        "地址": address,
        "总价": total_price,
        "单价": unit_price,
    }

# 保存数据到CSV
def save_to_csv():
    """Write the records in the module-level ``all_houses`` to '二手房数据.csv'.

    The file is written without the index column and encoded as utf-8-sig.
    When ``all_houses`` is empty nothing is written and a notice is printed
    instead.
    """
    if not all_houses:
        print("未爬取到任何数据")
        return

    listings_frame = pd.DataFrame(all_houses)
    listings_frame.to_csv('二手房数据.csv', index=False, encoding='utf-8-sig')
    print(f"成功爬取 {len(all_houses)} 条房源数据，已保存到二手房数据.csv")

# Run the crawl, then write whatever was collected to CSV.
# scrape_houses() quits the browser when it finishes, so the driver setup
# above has to be executed again before calling it a second time.
scrape_houses()
save_to_csv()

成功打开页面
正在爬取第 1 页
正在爬取第 2 页
正在爬取第 3 页
正在爬取第 4 页
正在爬取第 5 页
正在爬取第 6 页
正在爬取第 7 页
正在爬取第 8 页
正在爬取第 9 页
正在爬取第 10 页
正在爬取第 11 页
正在爬取第 12 页
正在爬取第 13 页
正在爬取第 14 页
正在爬取第 15 页
没有下一页或已达到最后一页
成功爬取 857 条房源数据，已保存到二手房数据.csv


In [12]:
# Listing page to crawl (zu.fang.com search results)
url = "https://zu.fang.com/house-a0987-b05499/"

# Accumulator: scrape_houses() appends one dict per listing here
all_houses = []

# Chrome configuration
options = Options()
# options.add_argument("--headless")  # uncomment to run without a visible window
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")

# Start the browser session used by scrape_houses()
driver = webdriver.Chrome(options=options)

# 爬取函数
def scrape_houses(max_pages=None):
    """Crawl the result pages starting at ``url`` and collect every listing.

    Works on the module-level ``driver``, ``url`` and ``all_houses``: the page
    is opened with ``driver``, and one dict per listing is appended to
    ``all_houses``. After each page the "下一页" link is clicked; the crawl
    stops when that link cannot be found/clicked or when ``max_pages`` pages
    have been scraped.

    A failure while parsing a single listing is printed and that listing is
    skipped. Any other exception is printed together with its traceback.
    The browser is always closed on exit, so ``driver`` has to be created
    again before this function can be called a second time.

    Args:
        max_pages: Optional upper bound on the number of pages to scrape.
            ``None`` (the default) follows the pagination until no next-page
            link is left. The first page is always scraped.

    Returns:
        None. Results are accumulated in ``all_houses``.
    """
    try:
        driver.get(url)
        print("成功打开页面")

        # Wait (up to 10 s) for the results container before reading anything;
        # a timeout is reported by the outer handler below.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.houseList"))
        )

        page_num = 1
        while True:
            print(f"正在爬取第 {page_num} 页")

            # Fixed pause to give the page time to finish loading
            time.sleep(2)

            for house in driver.find_elements(By.CSS_SELECTOR, "div.houseList dl"):
                try:
                    all_houses.append(_parse_rental_listing(house))
                except Exception as e:
                    # One unparsable listing must not abort the whole page
                    print(f"提取房源信息时出错: {e}")

            if max_pages is not None and page_num >= max_pages:
                print(f"已达到设定的最大页数 {max_pages}，停止爬取")
                break

            # Try the exact link text first, then fall back to a partial match.
            try:
                driver.find_element(By.LINK_TEXT, "下一页").click()
            except NoSuchElementException:
                try:
                    driver.find_element(By.PARTIAL_LINK_TEXT, "下一页").click()
                except Exception:
                    # `Exception` rather than a bare except, so that a kernel
                    # interrupt is not mistaken for "last page reached".
                    print("没有下一页或已达到最后一页")
                    break

            page_num += 1
            # Fixed pause so the next page can load before it is read
            time.sleep(3)

    except Exception as e:
        print(f"爬取过程中出错: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the browser, even after an error
        driver.quit()


def _parse_rental_listing(house):
    """Build the record dict for one listing element of the rental results page.

    Args:
        house: The Selenium element of a single listing (a ``dl`` inside
            ``div.houseList``).

    Returns:
        dict with the keys 房源ID, 标题, 链接, 租赁方式, 户型, 面积, 朝向, 地址, 价格.

    Raises:
        Whatever Selenium raises when a required sub-element is missing;
        the caller catches it and skips the listing.
    """
    # The id comes from the title paragraph; title text and URL from its link
    title_elem = house.find_element(By.CSS_SELECTOR, "p.title")
    house_id = title_elem.get_attribute("id")
    link_elem = title_elem.find_element(By.CSS_SELECTOR, "a")
    title = link_elem.get_attribute("title")
    link = link_elem.get_attribute("href")

    # The info line is "|"-separated; pad with "" so missing trailing
    # fields become empty strings instead of raising.
    info_text = house.find_element(By.CSS_SELECTOR, "p.font15.mt12.bold").text
    info_parts = [part.strip() for part in info_text.split("|")]
    info_parts += [""] * (4 - len(info_parts))
    rent_type, house_type, area, direction = info_parts[:4]

    address = house.find_element(By.CSS_SELECTOR, "p.gray6.mt12").text.strip()

    # The page element holds only the amount; the unit suffix is appended here
    price = house.find_element(By.CSS_SELECTOR, "span.price").text.strip() + "元/月"

    return {
        "房源ID": house_id,
        "标题": title,
        "链接": link,
        "租赁方式": rent_type,
        "户型": house_type,
        "面积": area,
        "朝向": direction,
        "地址": address,
        "价格": price,
    }

# 保存数据到CSV
def save_to_csv():
    """Write the records in the module-level ``all_houses`` to '租房数据.csv'.

    The file is written without the index column and encoded as utf-8-sig.
    When ``all_houses`` is empty nothing is written and a notice is printed
    instead.
    """
    if not all_houses:
        print("未爬取到任何数据")
        return

    listings_frame = pd.DataFrame(all_houses)
    listings_frame.to_csv('租房数据.csv', index=False, encoding='utf-8-sig')
    print(f"成功爬取 {len(all_houses)} 条房源数据，已保存到租房数据.csv")

# Run the crawl, then write whatever was collected to CSV.
# scrape_houses() quits the browser when it finishes, so the driver setup
# above has to be executed again before calling it a second time.
scrape_houses()
save_to_csv()

成功打开页面
正在爬取第 1 页
正在爬取第 2 页
正在爬取第 3 页
正在爬取第 4 页
正在爬取第 5 页
正在爬取第 6 页
正在爬取第 7 页
正在爬取第 8 页
正在爬取第 9 页
正在爬取第 10 页
没有下一页或已达到最后一页
成功爬取 547 条房源数据，已保存到租房数据.csv
