In [43]:
#from websocket._core import WebSocketApp
# websocket-client
from selenium import webdriver
from time import sleep
import pandas as pd
from io import StringIO
from functools import partial
import matplotlib.pyplot as plt
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common import NoSuchElementException,StaleElementReferenceException
from matplotlib import style

print(plt.style.available)
%matplotlib inline

driver=webdriver.Edge()

url='https://esf.fang.com/house-a015277-b022/'
driver.get(url)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


In [44]:
# Lists to store the scraped data
house_names = []
total_prices = []
areas = []
prices_per_sqm = []

In [45]:
# Function to extract data from a single page
def scrape_page(driver):
    # Wait for the listings to load
    sleep(2)
    
    # Find all house listing elements
    try:
        # Using the shop_list_4 class from your HTML screenshot
        listings = driver.find_elements(By.CSS_SELECTOR, ".shop_list.shop_list_4 dl")
        print(f"Found {len(listings)} listings on this page")
        
        for listing in listings:
            try:
                # Extract house name (title)
                try:
                    title_element = listing.find_element(By.CSS_SELECTOR, "span.tit_shop")
                    house_name = title_element.text.strip()
                    house_names.append(house_name)
                    print(f"House name: {house_name}")
                except NoSuchElementException:
                    # Try alternative selector
                    title_element = listing.find_element(By.CSS_SELECTOR, "h4 a")
                    house_name = title_element.text.strip()
                    house_names.append(house_name)
                    print(f"House name (alt): {house_name}")
                
                # Extract total price and price per square meter
                try:
                    # Get all price-related elements
                    price_elements = listing.find_elements(By.CSS_SELECTOR, "dd.price_right, dd.price_right_m")
                    
                    # First element should be total price (e.g., 790万)
                    if len(price_elements) > 0:
                        total_price = price_elements[0].text.strip()
                        if '万' in total_price:
                            total_price = total_price.replace('万', '')
                        total_prices.append(total_price)
                        print(f"Total price: {total_price}")
                    else:
                        total_prices.append(None)
                        print("Total price not found")
                    
                    # Second element should be price per square meter (e.g., 161554元/㎡)
                    if len(price_elements) > 1:
                        price_per_sqm = price_elements[1].text.strip()
                        if '元/㎡' in price_per_sqm:
                            price_per_sqm = price_per_sqm.replace('元/㎡', '')
                        elif '元/m²' in price_per_sqm:
                            price_per_sqm = price_per_sqm.replace('元/m²', '')
                        prices_per_sqm.append(price_per_sqm)
                        print(f"Price per sqm: {price_per_sqm}")
                    else:
                        prices_per_sqm.append(None)
                        print("Price per sqm not found")
                    
                except NoSuchElementException as e:
                    print(f"Price element not found: {e}")
                    total_prices.append(None)
                    prices_per_sqm.append(None)
                
                # Extract area from basic information section
                try:
                    # First try to find area in the main info section with pattern like "48.9m²"
                    info_texts = listing.find_elements(By.CSS_SELECTOR, "p.tel_shop, span.tel_shop")
                    area = None
                    
                    for element in info_texts:
                        element_text = element.text.strip()
                        print(f"Checking element text: {element_text}")
                        # Look for number followed by m² or ㎡
                        if ("m²" in element_text or "㎡" in element_text) and any(c.isdigit() for c in element_text):
                            # Extract the number part
                            import re
                            match = re.search(r"(\d+\.?\d*)(?:m²|㎡)", element_text)
                            if match:
                                area = match.group(1)
                                break
                    
                    # If area not found yet, check specific elements that might contain just the number
                    if area is None:
                        # Try to find the area text directly from spans or elements containing basic info
                        basic_info_elements = listing.find_elements(By.CSS_SELECTOR, ".tel_shop")
                        for element in basic_info_elements:
                            if any(c.isdigit() for c in element.text):
                                print(f"Potential area element: {element.text}")
                                # Look for patterns like "48.9m²" or just "48.9"
                                import re
                                match = re.search(r"(\d+\.?\d*)(?:m²|㎡)?", element.text)
                                if match:
                                    area = match.group(1)
                                    break
                    
                    areas.append(area)
                    print(f"Area extracted: {area}")
                except Exception as e:
                    print(f"Error extracting area: {e}")
                    areas.append(None)
                
            except Exception as e:
                print(f"Error processing listing: {e}")
                if len(house_names) < len(total_prices) + 1:
                    house_names.append(None)
                if len(total_prices) < len(house_names):
                    total_prices.append(None)
                if len(areas) < len(house_names):
                    areas.append(None)
                if len(prices_per_sqm) < len(house_names):
                    prices_per_sqm.append(None)
    
    except Exception as e:
        print(f"Error finding listings on page: {e}")
    
    # Make sure all lists have the same length
    max_len = max(len(house_names), len(total_prices), len(areas), len(prices_per_sqm))
    while len(house_names) < max_len:
        house_names.append(None)
    while len(total_prices) < max_len:
        total_prices.append(None)
    while len(areas) < max_len:
        areas.append(None)
    while len(prices_per_sqm) < max_len:
        prices_per_sqm.append(None)
        
    return len(listings) > 0


In [46]:
# Loop through the first 10 pages
for page_num in range(1, 11):
    print(f"\n=== Scraping page {page_num} ===")
    
    if page_num > 1:
        # Navigate to the next page - try clicking the next page button
        try:
            # First try to find and click the next page button
            next_page = driver.find_element(By.CLASS_NAME, 'nextp')
            driver.execute_script("arguments[0].scrollIntoView();", next_page)
            sleep(1)
            next_page.click()
            sleep(2)
        except NoSuchElementException:
            # If button not found, try direct URL
            try:
                next_page_url = f"{url}i3{page_num}/"
                print(f"Navigating to: {next_page_url}")
                driver.get(next_page_url)
            except Exception as e:
                print(f"Error navigating to page {page_num}: {e}")
                break
    
    # Scrape the current page
    if not scrape_page(driver):
        print(f"No listings found on page {page_num}. Stopping.")
        break
    
    # Wait between requests to avoid being blocked
    sleep(3)



=== Scraping page 1 ===
Found 60 listings on this page
House name: 电梯板楼 中间层两年前精装两居 直接入住 南北通透 房源充足
Total price: 710
81403元/㎡
Price per sqm not found
Checking element text: 2室1厅 | 87.22㎡ |中层 （共9层） | 南北向 | 2001年建 |王现伟
Area extracted: 87.22
House name: 3室1厅上地东里三区
Total price: 946
127837元/㎡
Price per sqm not found
Checking element text: 3室1厅 | 74㎡ |顶层 （共6层） | 南北向 | 1998年建 |李浩南
Area extracted: 74
House name: 电梯中高层 900万左右双卫三居室 三卧室朝南 满五年房源全
Total price: 920
74796元/㎡
Price per sqm not found
Checking element text: 3室2厅 | 123㎡ |中层 （共11层） | 南北向 | 2001年建 |王现伟
Area extracted: 123
House name: 1室1厅上地东里三区
Total price: 790
161554元/㎡
Price per sqm not found
Checking element text: 1室1厅 | 48.9㎡ |高层 （共5层） | 西南向 | 1998年建 |李浩南
Area extracted: 48.9
House name: 急售上地书包房,电梯高层,上地佳园大平层四居,采光视野超好
Total price: 2530
127578元/㎡
Price per sqm not found
Checking element text: 4室2厅 | 198.31㎡ |高层 （共16层） | 南北向 | 2004年建 |王耀佐
Area extracted: 198.31
House name: 上地大平层4居 书包房 上地西里 位置安静,南北通透精装修
Total price: 1650
91301元/㎡
Price per 

In [42]:
driver.quit()

In [47]:
# After scraping all pages, process the collected data
print("\nProcessing collected data...")

# Create a DataFrame with the collected data
housing_data = pd.DataFrame({
    'HouseName': house_names,
    'TotalPrice': total_prices,
    'Area': areas,
    'PricePerSqm': prices_per_sqm
})

# Display data before cleaning to verify
print("\nRaw data:")
print(housing_data.head())



Processing collected data...

Raw data:
                        HouseName       TotalPrice    Area PricePerSqm
0  电梯板楼 中间层两年前精装两居 直接入住 南北通透 房源充足    710\n81403元/㎡   87.22        None
1                      3室1厅上地东里三区   946\n127837元/㎡      74        None
2  电梯中高层 900万左右双卫三居室 三卧室朝南 满五年房源全    920\n74796元/㎡     123        None
3                      1室1厅上地东里三区   790\n161554元/㎡    48.9        None
4   急售上地书包房,电梯高层,上地佳园大平层四居,采光视野超好  2530\n127578元/㎡  198.31        None


In [48]:
# Process the TotalPrice column to separate total price and price per sqm
def process_price_data(row):
    if pd.isna(row['TotalPrice']):
        return row
    
    # Check if total price contains newline (indicating it has both values)
    if isinstance(row['TotalPrice'], str) and '\n' in row['TotalPrice']:
        parts = row['TotalPrice'].split('\n')
        row['TotalPrice'] = parts[0].strip()
        
        # If PricePerSqm is missing, use the second part from TotalPrice
        if pd.isna(row['PricePerSqm']) and len(parts) > 1:
            price_per_sqm = parts[1].strip()
            if '元/㎡' in price_per_sqm:
                price_per_sqm = price_per_sqm.replace('元/㎡', '')
            elif '元/m²' in price_per_sqm:
                price_per_sqm = price_per_sqm.replace('元/m²', '')
            row['PricePerSqm'] = price_per_sqm
    
    return row

In [49]:
# Apply the processing function
housing_data = housing_data.apply(process_price_data, axis=1)
housing_data

Unnamed: 0,HouseName,TotalPrice,Area,PricePerSqm
0,电梯板楼 中间层两年前精装两居 直接入住 南北通透 房源充足,710,87.22,81403
1,3室1厅上地东里三区,946,74,127837
2,电梯中高层 900万左右双卫三居室 三卧室朝南 满五年房源全,920,123,74796
3,1室1厅上地东里三区,790,48.9,161554
4,"急售上地书包房,电梯高层,上地佳园大平层四居,采光视野超好",2530,198.31,127578
...,...,...,...,...
595,2室1厅上地东里三区,820,59.5,137815
596,2室1厅上地东里一区,727,46.98,154746
597,"商圈力荐,南北向大三居,可做四居,满五,中间层,自住装修",1930,161.79,119290
598,2室1厅上地东里三区,820,59.5,137815


In [50]:
# Clean and convert data types
print("\nCleaning data...")
try:
    housing_data['TotalPrice'] = pd.to_numeric(housing_data['TotalPrice'], errors='coerce')
    housing_data['Area'] = pd.to_numeric(housing_data['Area'], errors='coerce')
    housing_data['PricePerSqm'] = pd.to_numeric(housing_data['PricePerSqm'], errors='coerce')
    print("Data types converted successfully")
except Exception as e:
    print(f"Error converting data types: {e}")

# Display the first few rows of the cleaned data
print("\nCleaned data:")
print(housing_data.head())
print("\nData summary:")
print(housing_data.describe())
print("\nNaN count per column:")
print(housing_data.isna().sum())

# Save the data to a CSV file
housing_data.to_csv('fang_housing_data_whole.csv', index=False)
print("\nData saved to 'fang_housing_data.csv'")


Cleaning data...
Data types converted successfully

Cleaned data:
                        HouseName  TotalPrice    Area  PricePerSqm
0  电梯板楼 中间层两年前精装两居 直接入住 南北通透 房源充足         710   87.22        81403
1                      3室1厅上地东里三区         946   74.00       127837
2  电梯中高层 900万左右双卫三居室 三卧室朝南 满五年房源全         920  123.00        74796
3                      1室1厅上地东里三区         790   48.90       161554
4   急售上地书包房,电梯高层,上地佳园大平层四居,采光视野超好        2530  198.31       127578

Data summary:
        TotalPrice        Area    PricePerSqm
count   600.000000  600.000000     600.000000
mean   1127.530000  112.537017  106346.980000
std     454.599476   51.240943   25266.586011
min     350.000000   41.070000   57500.000000
25%     820.000000   71.000000   84057.000000
50%    1046.500000  107.000000  105347.000000
75%    1298.000000  146.100000  124908.000000
max    3500.000000  314.710000  162466.000000

NaN count per column:
HouseName      0
TotalPrice     0
Area           0
PricePerSqm    0
dtype: int6