In [24]:
#from websocket._core import WebSocketApp
# websocket-client
from selenium import webdriver
from time import sleep
import pandas as pd
from io import StringIO
from functools import partial
import matplotlib.pyplot as plt
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common import NoSuchElementException,StaleElementReferenceException
from matplotlib import style

print(plt.style.available)
%matplotlib inline

driver=webdriver.Edge()

url='https://zu.fang.com/house-a015277-b022/'
driver.get(url)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


In [25]:
import re

In [26]:
# Accumulators for the scraped rental data: one entry per listing in each
# list, filled in by the scraping function and kept index-aligned.
house_names, monthly_rents, areas = [], [], []

In [27]:
# Helpers and entry point for extracting data from a single rental page
def _find_listings(driver):
    """Return the listing elements from the first selector that matches anything."""
    attempts = (
        ("div.houseList dl", "on this page"),
        ("div.list-box dl", "with alternative selector"),
        ("div[class*='list'] dl", "with second alternative"),
    )
    listings = []
    for selector, label in attempts:
        listings = driver.find_elements(By.CSS_SELECTOR, selector)
        print(f"Found {len(listings)} rental listings {label}")
        if listings:
            break
    return listings


def _extract_house_name(listing):
    """Return the first non-empty title text found in the listing, or None."""
    for selector in ("p.title a", "a[title]", "p.font15 a", "p.title"):
        try:
            text = listing.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Selector not present (or element unusable): try the next one.
            continue
        if text:
            return text
    return None


def _extract_monthly_rent(listing_text):
    """Return the rent as a digit string parsed from the listing text, or None."""
    # Preferred form: digits immediately followed by 元/月
    rent_match = re.search(r'(\d+)元/月', listing_text)
    if rent_match:
        return rent_match.group(1)

    # Fallback: the first standalone number with 4 or more digits.
    # NOTE: for str patterns \b treats CJK characters as word characters, so a
    # number glued directly to Chinese text is not matched by this fallback.
    large_number = re.search(r'\b(\d{4,})\b', listing_text)
    return large_number.group(1) if large_number else None


def _extract_area(listing, listing_text):
    """Return the area as a numeric string parsed from the listing, or None."""
    # Look for patterns like "20m²" or "20㎡" anywhere in the listing text
    area_match = re.search(r'(\d+(?:\.\d+)?)(?:m²|㎡)', listing_text)
    if area_match:
        return area_match.group(1)

    # Otherwise inspect the span.splitline elements for an area-like value
    try:
        for elem in listing.find_elements(By.CSS_SELECTOR, "span.splitline"):
            elem_text = elem.text.strip()
            looks_like_area = (
                "m²" in elem_text
                or "㎡" in elem_text
                or (elem_text.isdigit() and len(elem_text) <= 3)
            )
            if looks_like_area:
                digits = re.search(r'\d+', elem_text)
                if digits:
                    return digits.group(0)
    except Exception as e:
        # Best-effort fallback: report the problem and treat the area as missing.
        print(f"Area fallback lookup failed: {e}")
    return None


def scrape_rental_page(driver):
    """Scrape every rental listing on the page currently loaded in ``driver``.

    For each listing element found, exactly one value is appended to each of
    the module-level lists ``house_names``, ``monthly_rents`` and ``areas``
    (``None`` when a field could not be extracted), so index ``i`` of the three
    lists always refers to the same listing. Rent and area are stored as the
    strings captured from the page text.

    Args:
        driver: Object exposing ``find_elements(by, selector)``; in this
            notebook, the Selenium WebDriver.

    Returns:
        bool: True if at least one listing element was found, False otherwise,
        including when the listing lookup itself raised.
    """
    # Wait for the listings to load
    sleep(2)

    # Bound up front so the return value is defined even if the lookup fails.
    listings = []
    try:
        listings = _find_listings(driver)
    except Exception as e:
        print(f"Error finding rental listings: {e}")

    for listing in listings:
        name = rent = area = None

        # Each field is extracted independently so that one failure does not
        # discard the other fields of the same listing.
        try:
            name = _extract_house_name(listing)
        except Exception as e:
            print(f"Error extracting house name: {e}")
        else:
            print(f"House name: {name}" if name else "House name not found")

        try:
            rent = _extract_monthly_rent(listing.text)
        except Exception as e:
            print(f"Error extracting monthly rent: {e}")
        else:
            print(f"Monthly rent: {rent}" if rent else "Monthly rent not found")

        try:
            area = _extract_area(listing, listing.text)
        except Exception as e:
            print(f"Error extracting area: {e}")
        else:
            print(f"Area: {area}")

        # Single append point per listing keeps the three lists row-aligned.
        house_names.append(name)
        monthly_rents.append(rent)
        areas.append(area)

    # Defensive: only has an effect if the lists were already uneven before
    # this call; guarantees equal lengths for building a DataFrame later.
    target_len = max(len(house_names), len(monthly_rents), len(areas))
    for column in (house_names, monthly_rents, areas):
        column.extend([None] * (target_len - len(column)))

    return len(listings) > 0

In [28]:
# Scrape up to MAX_PAGES result pages, starting with the page that is already
# open in the browser. Stops early when a page yields no listings.
MAX_PAGES = 7

for page_num in range(1, MAX_PAGES + 1):
    print(f"\n=== Scraping rental page {page_num} ===")

    if page_num > 1:
        try:
            # Preferred route: click the site's own next-page button
            next_page = driver.find_element(By.CLASS_NAME, 'fNext')
            driver.execute_script("arguments[0].scrollIntoView();", next_page)
            sleep(1)
            next_page.click()
            sleep(2)
        except (NoSuchElementException, StaleElementReferenceException):
            # Button missing, or replaced in the DOM between lookup and click:
            # fall back to loading the page by its URL.
            try:
                next_page_url = f"{url}i3{page_num}/"
                print(f"Navigating to: {next_page_url}")
                driver.get(next_page_url)
            except Exception as e:
                print(f"Error navigating to page {page_num}: {e}")
                break

    # Scrape the current page; an empty page ends the crawl
    if not scrape_rental_page(driver):
        print(f"No rental listings found on page {page_num}. Stopping.")
        break

    # Wait between requests to avoid being blocked
    sleep(3)


=== Scraping rental page 1 ===
Found 60 rental listings on this page
House name: 上地西里颂芳园 3室1卫1厅 70平精装修 2000
Monthly rent: 2000
Area: 15
House name: 13号地铁线上地站 清河站 嘉华大厦 华联
Monthly rent: 1200
Area: 18
House name: 上地附近精装修 准要事情说三遍免佣 免佣 免佣
Monthly rent: 2000
Area: 20
House name: 上地13号地铁 体育大学 周边华联商厦 精装修
Monthly rent: 2000
Area: 20
House name: 怡美家园 4室1卫2厅 95平精装修 3800元/月
Monthly rent: 3800
Area: 25
House name: 上地大厦 上地桥附近精装修 免佣 想看房的速度联系免
Monthly rent: 2000
Area: 20
House name: 上地东里 精装修 准要说三遍 免佣 免佣 免佣
Monthly rent: 2300
Area: 20
House name: 可月付 免佣 13号线地铁上地 站 华联商厦 精装次
Monthly rent: 1100
Area: 60
House name: 上地南路14号院 3室1卫1厅 68平简装修 260
Monthly rent: 2600
Area: 68
House name: 上地东里精装修 格局反正 准要事情说三遍 免佣 免佣
Monthly rent: 2300
Area: 20
House name: 1室1厅西山公馆上园
Monthly rent: 7900
Area: 69
House name: 上地佳园 2室1卫1厅 66平简装修 2000元/月
Monthly rent: 2000
Area: 20
House name: 上地精装修 地铁站附近 免佣 免佣 免佣
Monthly rent: 2100
Area: 21
House name: 随时起租,看房随时联系我 全新装修拎包入住
Monthly rent: 7500
Area: 69
House name: 怡美家园 3室1卫1厅 65平精装修 35

In [29]:
# Assemble the three scraped lists into one table, one row per listing
rental_data = pd.DataFrame(
    {'HouseName': house_names, 'MonthlyRent': monthly_rents, 'Area': areas}
)

# Show a sample of the data exactly as scraped (values are still strings)
print("\nRaw rental data:")
print(rental_data.head())

# Convert the numeric columns; values that cannot be parsed become NaN
print("\nCleaning rental data...")
try:
    for numeric_column in ('MonthlyRent', 'Area'):
        rental_data[numeric_column] = pd.to_numeric(
            rental_data[numeric_column], errors='coerce'
        )
    print("Data types converted successfully")
except Exception as e:
    print(f"Error converting data types: {e}")


Raw rental data:
                    HouseName MonthlyRent Area
0  上地西里颂芳园 3室1卫1厅 70平精装修 2000        2000   15
1       13号地铁线上地站 清河站 嘉华大厦 华联        1200   18
2     上地附近精装修 准要事情说三遍免佣 免佣 免佣        2000   20
3     上地13号地铁 体育大学 周边华联商厦 精装修        2000   20
4  怡美家园 4室1卫2厅 95平精装修 3800元/月        3800   25

Cleaning rental data...
Data types converted successfully


In [30]:
# Display the first few rows of the cleaned data
print("\nCleaned rental data:")
print(rental_data.head())
print("\nRental data summary:")
print(rental_data.describe())
print("\nNaN count per column:")
print(rental_data.isna().sum())

# Save the data to a CSV file. The path is held in one variable so the
# confirmation message always names the file that was actually written.
output_csv = 'fang_rental_data_whole.csv'
rental_data.to_csv(output_csv, index=False)
print(f"\nRental data saved to '{output_csv}'")



Cleaned rental data:
                    HouseName  MonthlyRent  Area
0  上地西里颂芳园 3室1卫1厅 70平精装修 2000         2000    15
1       13号地铁线上地站 清河站 嘉华大厦 华联         1200    18
2     上地附近精装修 准要事情说三遍免佣 免佣 免佣         2000    20
3     上地13号地铁 体育大学 周边华联商厦 精装修         2000    20
4  怡美家园 4室1卫2厅 95平精装修 3800元/月         3800    25

Rental data summary:
        MonthlyRent        Area
count    388.000000  388.000000
mean    6701.247423   69.693299
std     5005.644872   63.494908
min     1000.000000    6.000000
25%     2400.000000   18.000000
50%     5800.000000   45.000000
75%    11000.000000  119.250000
max    19000.000000  273.000000

NaN count per column:
HouseName      0
MonthlyRent    0
Area           0
dtype: int64

Rental data saved to 'fang_rental_data.csv'
