In [1]:
import requests
from lxml import etree
import pandas as pd
import threading
import time
import urllib3

In [2]:
# Suppress InsecureRequestWarning when SSL verification is disabled.
# The scraping functions in this notebook call requests.get(..., verify=False),
# which would otherwise emit this warning on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# List of district URLs to scrape.
# Only the uncommented entries are crawled; uncomment a line to include that
# district in the next run. Entries are written with a trailing slash, which is
# the form the district-key and pagination code in this notebook is written
# against.
district_urls = [
# 'https://sh.lianjia.com/zufang/jingan/',
# 'https://sh.lianjia.com/zufang/xuhui/',
# 'https://sh.lianjia.com/zufang/huangpu/',
# 'https://sh.lianjia.com/zufang/changning/',
# 'https://sh.lianjia.com/zufang/putuo/',
# 'https://sh.lianjia.com/zufang/pudong/',
# 'https://sh.lianjia.com/zufang/baoshan/',
'https://sh.lianjia.com/zufang/hongkou/',
'https://sh.lianjia.com/zufang/yangpu/',
# 'https://sh.lianjia.com/zufang/minhang/',
# 'https://sh.lianjia.com/zufang/jinshan/',
# 'https://sh.lianjia.com/zufang/jiading/',
# 'https://sh.lianjia.com/zufang/chongming/',
# 'https://sh.lianjia.com/zufang/fengxian/',
# 'https://sh.lianjia.com/zufang/songjiang/',
# 'https://sh.lianjia.com/zufang/qingpu/'
]


In [3]:
# Lookup table: district slug (the last path segment of a district URL)
# -> human-readable district name used in log messages and the 'district' column.
district_names = dict(
    jingan="Jing'an",
    xuhui="Xuhui",
    huangpu="Huangpu",
    changning="Changning",
    putuo="Putuo",
    pudong="Pudong",
    baoshan="Baoshan",
    hongkou="Hongkou",
    yangpu="Yangpu",
    minhang="Minhang",
    jinshan="Jinshan",
    jiading="Jiading",
    chongming="Chongming",
    fengxian="Fengxian",
    songjiang="Songjiang",
    qingpu="Qingpu",
)

In [4]:
# Function to get the total number of houses for a given URL
def get_total_houses(tree):
    """Return the listing count shown in the page's title banner.

    Parameters
    ----------
    tree
        Parsed HTML root exposing an ``xpath`` method (as returned by
        ``etree.HTML``), or ``None`` when the document could not be parsed.

    Returns
    -------
    int
        The count read from the first
        ``p.content__title > span.content__title--hl`` text node, or ``0``
        when the tree is ``None``, the node is missing, or its text is not
        an integer.
    """
    if tree is None:
        return 0

    total_houses_xpath = "//p[@class='content__title']/span[@class='content__title--hl']/text()"
    matches = tree.xpath(total_houses_xpath)
    if not matches:
        return 0

    # Tolerate surrounding whitespace and thousands separators ("1,234").
    raw_count = str(matches[0]).strip().replace(',', '').replace('\uff0c', '')
    try:
        return int(raw_count)
    except ValueError:
        # Missing or unparseable counter is treated like "no listings", the
        # same result the function already gives when the node is absent.
        return 0

# Function to add page numbers to district URLs
def generate_paginated_urls(url, total_houses, houses_per_page=30):
    """Build the list of page URLs needed to cover ``total_houses`` listings.

    Parameters
    ----------
    url : str
        District base URL. A trailing slash is added if missing so the page
        suffix is always a separate path segment.
    total_houses : int
        Number of listings to cover.
    houses_per_page : int, default 30
        Listings per result page.

    Returns
    -------
    list[str]
        ``<url>/pg1/`` .. ``<url>/pgN/`` where N is ``total_houses`` divided by
        ``houses_per_page`` rounded up; empty when ``total_houses`` is 0.
    """
    # Ceiling division without floats.
    total_pages = (total_houses + houses_per_page - 1) // houses_per_page
    # Without this, "…/hongkou" would produce "…/hongkoupg1/".
    base_url = url if url.endswith('/') else url + '/'
    return [f"{base_url}pg{page}/" for page in range(1, total_pages + 1)]

# Helper: pick one stripped string out of an XPath result list
def _stripped_at(items, index=0):
    """Return ``items[index].strip()``, or ``''`` when the list is too short."""
    return items[index].strip() if len(items) > index else ''


# Function to parse a single page URL and extract house details
def parse_page(url, district_name):
    """Download one listing page and return one dict per listing card.

    Parameters
    ----------
    url : str
        Page URL to fetch (SSL verification is disabled for the request).
    district_name : str
        Value stored in the ``'district'`` field of every returned record.

    Returns
    -------
    list[dict]
        One record per ``div.content__list--item`` with the keys
        ``district, title, link, location, size, orientation, rooms, floor,
        tags, brand, maintenance_time, price``. A field that cannot be found
        is ``''``. Returns ``[]`` when the page yields no parse tree.

    Raises
    ------
    requests.RequestException
        On connection failure or when the server does not respond within
        the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
        # Accept-Encoding is deliberately left to requests: it advertises only
        # encodings it can decode. Forcing "br" without a brotli decoder
        # installed leaves response.content compressed and unparseable.
        'Accept-Language': 'en-US,en;q=0.9',
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    tree = etree.HTML(response.content)
    if tree is None:
        # Nothing parseable came back (e.g. an empty body).
        return []

    title_link = ".//p[@class='content__list--item--title']/a"
    description = ".//p[@class='content__list--item--des']"
    brand_row = ".//p[@class='content__list--item--brand oneline']"

    house_data = []
    for house in tree.xpath("//div[@class='content__list--item']"):
        # Evaluate each XPath once and reuse the result.
        hrefs = house.xpath(f"{title_link}/@href")
        # NOTE(review): size/orientation/rooms are taken from fixed positions
        # 1..3 of the description's direct text nodes — confirm against the
        # live markup that these positions hold the intended values.
        description_texts = house.xpath(f"{description}/text()")
        price_texts = house.xpath(".//span[@class='content__list--item-price']/em/text()")

        house_data.append({
            'district': district_name,
            'title': _stripped_at(house.xpath(f"{title_link}/text()")),
            'link': 'https://sh.lianjia.com' + hrefs[0] if hrefs else '',
            'location': ' '.join(house.xpath(f"{description}/a/text()")),
            'size': _stripped_at(description_texts, 1),
            'orientation': _stripped_at(description_texts, 2),
            'rooms': _stripped_at(description_texts, 3),
            'floor': _stripped_at(house.xpath(f"{description}/span[@class='hide']/text()")),
            'tags': ', '.join(house.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()")),
            # Guard on the text nodes themselves: a span that exists but is
            # empty previously raised IndexError.
            'brand': _stripped_at(house.xpath(f"{brand_row}/span[@class='brand']/text()")),
            'maintenance_time': _stripped_at(
                house.xpath(f"{brand_row}/span[@class='content__list--item--time oneline']/text()")
            ),
            'price': price_texts[0].strip() + '元/月' if price_texts else '',
        })
    return house_data

# Main function to orchestrate scraping for each district
def scrape_districts(urls=None, sleep_seconds=3):
    """Scrape every listing page of each district and save one CSV per district.

    For each district URL this fetches the landing page to read the total
    listing count, derives the page URLs, parses every page with
    ``parse_page`` and writes the combined rows to
    ``<district_key>_zufang_data.csv`` in the current working directory.

    Parameters
    ----------
    urls : iterable of str, optional
        District URLs to scrape. Defaults to the module-level
        ``district_urls`` list.
    sleep_seconds : float, default 3
        Pause after each page request to mitigate the risk of blocking.

    Returns
    -------
    dict[str, pandas.DataFrame]
        DataFrames keyed ``'df_zufang_<district_key>'``.
    """
    if urls is None:
        urls = district_urls

    # Send the same browser-like headers on the landing-page request as
    # parse_page sends on the listing pages; the original sent none here.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    district_dataframes = {}
    for url in urls:
        # Last path segment, whether or not the URL ends with a slash.
        district_key = url.rstrip('/').split('/')[-1]
        district_name = district_names.get(district_key, 'Unknown')
        print(f"Scraping district: {district_name}")
        # A timeout keeps one stalled connection from hanging the whole run.
        initial_response = requests.get(url, headers=headers, verify=False, timeout=30)
        tree = etree.HTML(initial_response.content)
        total_houses = get_total_houses(tree) if tree is not None else 0
        if total_houses == 0:
            # Usually means the counter element was not in the response.
            print(f"No listings counted for {district_name}; the CSV will be empty")
        paginated_urls = generate_paginated_urls(url, total_houses)
        all_data = []
        for page_url in paginated_urls:
            print(f"Scraping pages: {page_url}")
            all_data.extend(parse_page(page_url, district_name))
            time.sleep(sleep_seconds)  # Throttle requests to mitigate risk of blocking
        district_df = pd.DataFrame(all_data)
        district_dataframes[f'df_zufang_{district_key}'] = district_df
        district_df.to_csv(f"{district_key}_zufang_data.csv", index=False)
        print(f"Data for {district_name} saved to {district_key}_zufang_data.csv")
    return district_dataframes

In [5]:
# Execute scraping for every URL currently enabled in district_urls.
# This performs live HTTP requests with a pause after each page and writes one
# "<district>_zufang_data.csv" file per district into the working directory.
district_dfs = scrape_districts()
# The result is a dict of DataFrames keyed 'df_zufang_<district>'.
# Access a specific district DataFrame if needed, e.g.:
# print(district_dfs['df_zufang_hongkou'].head())

Scraping district: Hongkou
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg1/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg2/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg3/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg4/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg5/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg6/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg7/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg8/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg9/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg10/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg11/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg12/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg13/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg14/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg15/
Scraping pages: https://sh.lianjia.com/zufang/hongkou/pg16/
Scraping pages: https:

In [7]:
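# # Scrape the configured districts and preview each result.
# # scrape_districts() returns a dict of DataFrames keyed 'df_zufang_<district>',
# # not a single DataFrame.
# district_dfs = scrape_districts()
# for key, df in district_dfs.items():
#     print(key, df.shape)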