In [1]:
import json
import os
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from urllib.parse import urljoin

import joblib
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
def urlGet(url, headers, max_try=3, sleep_time=5, timeout=30):
    '''
    GET `url` with retries and return the response, or None when every attempt fails.

    Parameters
    ----------
    url : str
        Address to request.
    headers : dict
        HTTP headers passed to `requests.get`.
    max_try : int
        Maximum number of attempts.
    sleep_time : float
        Seconds to wait between two attempts.
    timeout : float
        Seconds `requests.get` may wait on the server before the attempt counts as failed.

    Returns
    -------
    requests.Response or None
        The first response whose status code is 200, otherwise None.
    '''
    for attempt in range(1, max_try + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Network-level failure: treat it like a non-200 answer and retry.
            response = None

        # Return as soon as an attempt succeeds, including the very last one.
        if response is not None and response.status_code == 200:
            return response

        # No point in waiting once the attempts are used up.
        if attempt < max_try:
            sleep(sleep_time)

    return None


def parse_source(source, prefix):
    '''
    Parse one result page into a list of record dicts.

    Parameters
    ----------
    source : str
        HTML of a result page.
    prefix : dict
        Key/value pairs copied into every record before the parsed fields are added.

    Returns
    -------
    list of dict
        One dict per `ul.listContent > li` element. Every value is the raw text taken
        from the page; '交易周期' is None when the listing has no `div.dealCycleeInfo`.

    Raises
    ------
    AttributeError
        When a listing lacks one of the mandatory elements (dealDate, title, houseInfo,
        totalPrice, positionInfo, unitPrice).
    '''
    data = bs(source, 'html.parser')
    parsed = []
    for li in data.select('ul.listContent > li'):
        tem = prefix.copy()

        deal_time = li.select_one('div.dealDate').get_text()

        # The title is space separated: first token -> name, last token -> area.
        title_parts = li.select_one('div.title').get_text().split(' ')
        name = title_parts[0]
        area = title_parts[-1]

        # houseInfo is ' | ' separated: first part -> orientation, last part -> decoration.
        info = li.select_one('div.houseInfo').get_text()
        info_parts = info.split(' | ')
        chaoxiang = info_parts[0]
        zhuangxiu = info_parts[-1]

        price = li.select_one('div.totalPrice').get_text()

        # positionInfo is space separated; its last token is split again on '年'.
        position_parts = li.select_one('div.positionInfo').get_text().split(' ')
        louceng = position_parts[0]
        year_and_type = position_parts[-1].split('年')
        building = year_and_type[0]
        louxing = year_and_type[-1]

        unitPrice = li.select_one('div.unitPrice').get_text()

        # Optional element: check for it explicitly instead of swallowing every exception.
        cycle_node = li.select_one('div.dealCycleeInfo')
        dealCycleeInfo = cycle_node.get_text() if cycle_node is not None else None

        tem.update({'成交时间' : deal_time, '小区名称' : name, '建筑面积' : area, '朝向' : chaoxiang, '装修' : zhuangxiu,
                    '成交价格' : price, '楼层' : louceng, '建成年份' : building, '建筑类型' : louxing,
                    '房屋类型' : info, '成交平米价格' : unitPrice, '交易周期' : dealCycleeInfo})
        parsed.append(tem)
    return parsed


def get_city_urls(header):
    '''
    Collect the transaction-page URL of every city listed on the lianjia city index.

    Parameters
    ----------
    header : dict
        HTTP headers used for the request.

    Returns
    -------
    list of dict
        One dict per city link with the keys '省份' (province heading text),
        '城市' (link text) and 'url' (link href followed by 'chengjiao/').

    Raises
    ------
    ConnectionError
        When the city index cannot be fetched.
    '''
    url = 'https://www.lianjia.com/city/'
    response = urlGet(url, headers=header)
    if response is None:
        # urlGet signals failure with None; fail with a clear message instead of an AttributeError.
        raise ConnectionError(f'could not fetch the city index: {url}')

    soup = bs(response.text, 'html.parser')
    urls = []
    for province in soup.select('div.city_province'):
        province_name = province.find('div', {'class' : 'city_list_tit c_b'}).get_text()
        urls.extend(
            {'省份' : province_name, '城市' : link.get_text(), 'url' : link['href']+'chengjiao/'}
            for link in province.find_all('a')
        )
    return urls


def get_count(response):
    '''
    Read the result total shown on a listing page.

    The number is the second space-separated token of the text inside the
    <span> of the `div` whose class is 'total fl'.
    '''
    page = bs(response.text, 'html.parser')
    total_box = page.find('div', {'class' : 'total fl'})
    tokens = total_box.span.get_text().split(' ')
    return int(tokens[1])


def get_max_page(response):
    '''
    Return the number of result pages announced by a listing page.

    The value is read from the 'totalPage' entry of the JSON stored in the
    'page-data' attribute of the pagination `div`. Returns 0 when the
    pagination box is missing or its data cannot be decoded.
    '''
    soup = bs(response.text, 'html.parser')
    page_box = soup.find('div', {'class' : 'page-box house-lst-page-box'})
    try:
        return json.loads(page_box['page-data'])['totalPage']
    except (TypeError, KeyError, ValueError):
        # TypeError: no pagination box (None) or unexpected JSON shape;
        # KeyError: attribute or 'totalPage' missing; ValueError: invalid JSON.
        return 0
    
    
def split_series(series, n):
    '''
    Cut `series` into `n` consecutive slices whose lengths differ by at most one.

    The first `len(series) % n` slices receive the extra element. Slices are
    produced with ordinary slicing, so they have the same type as `series`.
    '''
    base_size, extra = divmod(len(series), n)

    chunks = []
    cursor = 0
    for idx in range(n):
        size = base_size + 1 if idx < extra else base_size
        chunks.append(series[cursor:cursor + size])
        cursor += size

    return chunks


def _count_or_none(response):
    '''Return get_count(response), or None when the page has no parsable total.'''
    try:
        return get_count(response)
    except Exception:
        # Best effort: an unexpected page layout must not abort the whole crawl.
        return None


def _header_at(headers, index):
    '''Return headers[index], falling back to the last header when the list is shorter.'''
    return headers[min(index, len(headers) - 1)]


def collect_city(city_dic, headers):
    '''
    Build the lists of result-page URLs for every district of one city.

    Parameters
    ----------
    city_dic : dict
        Must hold the keys '省份', '城市' and 'url' (the city's '/chengjiao/' page).
    headers : list of dict
        Non-empty list of HTTP header dicts; different request levels pick different entries.

    Returns
    -------
    (city_data, failed) : tuple of two lists
        city_data holds one dict per place (or place + floor filter) with the key 'URLs'
        (list of page URLs) plus '省份', '城市', '地区' and '区域'.
        failed holds what could not be enumerated: plain URLs, or `[prefix, floor_url]`
        pairs for floor filters that still exceed the result limit.
        A pair is returned even when the city page itself cannot be fetched.
    '''
    # Above this many results the place is split by floor filter.
    # NOTE(review): presumably the site only pages through the first 3000 results - confirm.
    max_results = 3000

    city_data, failed = [], []
    city_name = city_dic['省份'] + city_dic['城市']
    print(f'Scraping {city_name}')
    city_url = city_dic['url']
    site_root = city_url.replace('/chengjiao/', '')

    city_response = urlGet(city_url, headers[0])
    if city_response is None:
        print('connection Failed : ', city_url)
        failed.append(city_url)
        return city_data, failed

    # Collect the district links.
    city_soup = bs(city_response.text, 'html.parser')
    position_box = city_soup.select_one('div.position')
    if position_box is None:
        print('District list not found : ', city_url)
        failed.append(city_url)
        return city_data, failed
    area_links = position_box.find_all('a', {'class' : None})

    # Visit every district.
    for i, area in enumerate(area_links):
        area_name = area.get_text()
        print(f'\t Scraping {area_name}')
        area_url = site_root + area['href']
        area_response = urlGet(area_url, _header_at(headers, i))
        if area_response is None:
            print('Area connection failed : ', area_url)
            continue

        # Skip districts without data.
        count = _count_or_none(area_response)
        if count is None:
            print(f'\t {area_name} count failed')
            continue
        if count == 0:
            print(f'\t {area_name} has 0 results')
            continue

        # Narrow down by place inside the district.
        area_soup = bs(area_response.text, 'html.parser')
        filter_box = area_soup.find('div', {'data-role' : 'ershoufang'})
        if filter_box is None:
            print('Place list not found : ', area_url)
            failed.append(area_url)
            continue
        place_options = filter_box.select('div:nth-child(2) > a')

        for place in place_options:
            # A fresh dict per place, so stored records are not changed by later iterations.
            prefix = {'省份' : city_dic['省份'], '城市' : city_dic['城市'],
                      '地区' : area_name, '区域' : place.get_text()}
            place_url = urljoin(city_url, place['href'])
            place_response = urlGet(place_url, _header_at(headers, 1))
            if place_response is None:
                print('Place Selection failed : ', place_url)
                continue

            place_count = _count_or_none(place_response)
            if place_count is None:
                failed.append(place_url)
                continue
            if place_count == 0:
                continue

            if place_count <= max_results:
                max_page = get_max_page(place_response)
                if max_page == 0:
                    print('Max_page = 0, some error happened : ', place_url)
                    continue
                url_list = [f'{place_url}pg{page}/' for page in range(1, max_page + 1)]
                # dict.update() returns None, so the record is built in one expression.
                city_data.append({'URLs' : url_list, **prefix})
                continue

            # Too many results: add the floor filter.
            place_soup = bs(place_response.text, 'html.parser')
            try:
                floor_options = (place_soup.select_one('div.list-more')
                                 .find('dt', text='楼层')
                                 .find_parent('dl')
                                 .find_all('a', {'class' : ''}))
            except AttributeError:
                # One of the lookups returned None: the floor filter is not on the page.
                print('Floor filter not found : ', place_url)
                failed.append(place_url)
                continue

            # Visit the URLs that carry the floor filter.
            for floor_option in floor_options:
                floor_url = site_root + floor_option['href']
                floor_response = urlGet(floor_url, _header_at(headers, 2))
                if floor_response is None:
                    print('Floor Selection failed : ', floor_url)
                    continue

                floor_count = _count_or_none(floor_response)
                if floor_count is None or floor_count > max_results:
                    failed.append([dict(prefix), floor_url])
                    continue
                if floor_count == 0:
                    continue

                max_page = get_max_page(floor_response)
                if max_page == 0:
                    print('Max_page = 0, some error happened : ', floor_url)
                    continue
                # The page token is appended to the floor URL without its trailing character.
                url_list_f = [f'{floor_url[:-1]}pg{page}/' for page in range(1, max_page + 1)]
                city_data.append({'URLs' : url_list_f, **prefix})

    return city_data, failed
            
        
def collect_data(url_list, headers, prefix):
    '''
    Download and parse every URL of `url_list` with a pool of worker threads.

    Parameters
    ----------
    url_list : sequence of str
        Result-page URLs to fetch.
    headers : list of dict
        HTTP header dicts; one worker is started per header, but never more
        workers than there are URLs.
    prefix : dict
        Key/value pairs added to every parsed record (see `parse_source`).

    Returns
    -------
    list of dict
        Parsed records, ordered by the position of their URL in `url_list`.
        URLs that cannot be fetched or parsed are reported on stdout and skipped.

    Raises
    ------
    ValueError
        When there are URLs to fetch but `headers` is empty.
    '''
    if len(url_list) == 0:
        return []
    if len(headers) == 0:
        raise ValueError('headers must contain at least one request header')

    def collect_single(urls, header):
        # Every worker fills its own list, so no state is shared between threads.
        rows = []
        for url in urls:
            response = urlGet(url, header)
            if response is None:
                print('get failed : ', url)
                sleep(5)
                continue
            try:
                rows += parse_source(response.text, prefix)
            except Exception:
                print('parsing failed : ', url)
            sleep(3)
        return rows

    # Never start more threads than there is work for.
    worker_count = min(len(headers), len(url_list))
    chunks = split_series(url_list, worker_count)

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [executor.submit(collect_single, chunk, header)
                   for chunk, header in zip(chunks, headers)]
        data = []
        # Read the futures in submission order to keep the output deterministic.
        for future in futures:
            data += future.result()

    return data

In [3]:
# The session cookie carries login tokens, so it is read from the LIANJIA_COOKIE
# environment variable instead of being stored in the notebook.
h = {'Cookie' : os.environ.get('LIANJIA_COOKIE', '')}
if not h['Cookie']:
    print('LIANJIA_COOKIE is not set; requests will be sent without a session cookie')

In [4]:
# Every one of the 1000 slots refers to the same header dict `h`.
headers = [h] * 1000
# Previously saved city records, read from the file '城市网址' in the working directory.
city_dics = joblib.load('./城市网址')

In [5]:
# Crawl every city and accumulate the page-URL records and the failure records.
all_data, all_failed = [], []
for city_dic in city_dics:
    result = collect_city(city_dic, headers)
    if result is None:
        # collect_city may return None instead of a (data, failed) pair when the
        # city page cannot be fetched; record the city and keep crawling.
        all_failed.append(city_dic['url'])
        continue
    data, failed = result
    all_data += data
    all_failed += failed

Scraping 安徽安庆
	 Scraping 大观区
	 Scraping 太湖县
	 太湖县 has 0 results
	 Scraping 宜秀区
	 Scraping 宿松县
	 宿松县 has 0 results
	 Scraping 岳西县
	 岳西县 has 0 results
	 Scraping 怀宁县
	 Scraping 望江县
	 望江县 has 0 results
	 Scraping 桐城市
	 桐城市 has 0 results
	 Scraping 潜山市
	 Scraping 迎江区
Scraping 安徽阜阳
	 Scraping 临泉县
	 Scraping 太和县
	 太和县 has 0 results
	 Scraping 界首市
	 界首市 has 0 results
	 Scraping 阜南县
	 阜南县 has 0 results
	 Scraping 颍上县
	 Scraping 颍东区
	 Scraping 颍州区
	 Scraping 颍泉区
Scraping 安徽合肥
	 Scraping 包河
	 Scraping 巢湖市
	 Scraping 庐江县
	 庐江县 has 0 results
	 Scraping 空港经济示范区
	 Scraping 蜀山
	 Scraping 庐阳
	 Scraping 瑶海
	 Scraping 政务
	 Scraping 滨湖新区
	 Scraping 经开
	 Scraping 高新
	 Scraping 新站
	 Scraping 肥东
	 Scraping 肥西
	 Scraping 长丰
Scraping 安徽马鞍山
	 Scraping 博望区
	 博望区 has 0 results
	 Scraping 含山县
	 含山县 has 0 results
	 Scraping 和县
	 和县 has 0 results
	 Scraping 当涂县
	 Scraping 花山区
	 Scraping 雨山区
Scraping 安徽芜湖
	 Scraping 三山区
	 Scraping 南陵县
	 Scraping 弋江区
	 Scraping 无为市
	 Scraping 繁昌区
	 Scraping 经济开发区
	 Scraping 湾沚区
	 Scr

	 大名县 has 0 results
	 Scraping 武安市
	 武安市 has 0 results
	 Scraping 魏县
	 魏县 has 0 results
	 Scraping 广平县
	 广平县 has 0 results
	 Scraping 曲周县
	 曲周县 has 0 results
	 Scraping 丛台区
	 Scraping 复兴区
	 Scraping 磁县
	 Scraping 峰峰矿区
	 峰峰矿区 has 0 results
	 Scraping 邯山区
	 Scraping 肥乡区
	 Scraping 临漳县
	 Scraping 涉县
	 涉县 has 0 results
	 Scraping 永年区
	 永年区 has 0 results
	 Scraping 鸡泽县
	 鸡泽县 has 0 results
	 Scraping 成安县
	 Scraping 邱县
	 邱县 has 0 results
Scraping 河北廊坊
	 Scraping 燕郊
	 Scraping 香河
	 Scraping 广阳
	 Scraping 安次
	 Scraping 廊坊经济技术开发区
	 Scraping 固安
	 Scraping 大厂
	 Scraping 永清
	 永清 has 0 results
Scraping 河北石家庄
	 Scraping 裕华
	 Scraping 长安
	 Scraping 桥西
	 Scraping 新华
	 Scraping 开发区
	 Scraping 正定
	 Scraping 鹿泉
	 Scraping 栾城
	 Scraping 藁城
	 Scraping 元氏县
	 Scraping 辛集市
	 Scraping 灵寿县
	 灵寿县 has 0 results
	 Scraping 平山
	 Scraping 井陉县
	 井陉县 has 0 results
	 Scraping 赵县
	 赵县 has 0 results
	 Scraping 深泽县
	 深泽县 has 0 results
	 Scraping 晋州市
	 Scraping 高邑县
	 高邑县 has 0 results
	 Scraping 新乐
	 Scraping 井陉矿区
	 井陉矿区 ha

	 崇义县 has 0 results
	 Scraping 石城县
	 石城县 has 0 results
	 Scraping 龙南市
	 龙南市 has 0 results
Scraping 江西九江
	 Scraping 八里湖新区
	 Scraping 经济开发区
	 Scraping 修水县
	 修水县 has 0 results
	 Scraping 共青城市
	 共青城市 has 0 results
	 Scraping 庐山市
	 Scraping 彭泽县
	 彭泽县 has 0 results
	 Scraping 德安县
	 Scraping 柴桑区
	 Scraping 武宁县
	 武宁县 has 0 results
	 Scraping 永修县
	 永修县 has 0 results
	 Scraping 浔阳区
	 Scraping 湖口县
	 湖口县 has 0 results
	 Scraping 濂溪区
	 Scraping 瑞昌市
	 Scraping 都昌县
	 都昌县 has 0 results
Scraping 江西吉安
	 Scraping 吉州区
	 Scraping 青原区
	 Scraping 吉安县
	 Scraping 吉水县
	 Scraping 泰和县
	 Scraping 安福县
	 Scraping 永新县
	 Scraping 永丰县
	 Scraping 遂川县
	 Scraping 峡江县
	 峡江县 has 0 results
	 Scraping 新干县
Scraping 江西南昌
	 Scraping 东湖区
	 Scraping 南昌县
	 Scraping 安义县
	 安义县 has 0 results
	 Scraping 新建区
	 Scraping 湾里区
	 Scraping 红谷滩
	 Scraping 西湖区
	 Scraping 进贤县
	 Scraping 青云谱区
	 Scraping 青山湖区
	 Scraping 高新区
	 Scraping 经开区
Scraping 江西上饶
	 Scraping 广信区
	 Scraping 信州区
	 Scraping 广丰区
	 Scraping 玉山县
Scraping 江苏常州
	 Scraping 武进区
	 Scrap

	 曹县 count failed
	 Scraping 巨野县
	 巨野县 count failed
	 Scraping 东明县
	 东明县 count failed
	 Scraping 鄄城县
	 鄄城县 count failed
	 Scraping 郓城县
	 郓城县 count failed
Scraping 山东济南
	 Scraping 历下
	 Scraping 莱芜区
	 莱芜区 has 0 results
	 Scraping 市中
	 Scraping 天桥
	 Scraping 历城
	 Scraping 槐荫
Place Selection failed :  https://jn.lianjia.com/chengjiao/xishichang/
	 Scraping 高新
	 Scraping 济阳
	 Scraping 商河
	 商河 has 0 results
	 Scraping 平阴
	 平阴 has 0 results
	 Scraping 章丘
	 Scraping 长清
Scraping 山东济宁
	 Scraping 任城区
	 Scraping 兖州区
	 Scraping 嘉祥县
	 嘉祥县 has 0 results
	 Scraping 微山县
	 微山县 has 0 results
	 Scraping 曲阜市
	 曲阜市 has 0 results
	 Scraping 梁山县
	 梁山县 has 0 results
	 Scraping 汶上县
	 Scraping 泗水县
	 泗水县 has 0 results
	 Scraping 邹城市
	 Scraping 金乡县
	 金乡县 has 0 results
	 Scraping 鱼台县
	 鱼台县 has 0 results
Scraping 山东临沂
	 Scraping 兰山区
	 Scraping 河东区
	 Scraping 罗庄区
	 Scraping 莒南县
	 莒南县 has 0 results
	 Scraping 沂水县
	 Scraping 临沭县
	 临沭县 has 0 results
	 Scraping 平邑县
	 平邑县 has 0 results
	 Scraping 费县
	 费县 has 0 results
	 S

	 1号线 count failed
	 Scraping 2号线
	 2号线 count failed
	 Scraping 3号线
	 3号线 count failed
	 Scraping 4号线
	 4号线 count failed
	 Scraping 5号线
	 5号线 count failed
	 Scraping 6号线
	 6号线 count failed
	 Scraping 9号线
	 9号线 count failed
	 Scraping 10号线
	 10号线 count failed
Scraping 新疆乌鲁木齐
	 Scraping 乌鲁木齐县
	 Scraping 天山区
	 Scraping 经济技术开发区（头屯河区）
	 Scraping 新市区
	 Scraping 水磨沟区
	 Scraping 沙依巴克区
	 Scraping 米东区
	 Scraping 达坂城区
	 达坂城区 has 0 results
Scraping 云南大理
	 Scraping 凤仪
	 Scraping 古城
	 Scraping 海东
	 Scraping 经开区
	 Scraping 满江片区
	 Scraping 市区
	 Scraping 下关北区
Scraping 云南昆明
	 Scraping 五华
	 Scraping 盘龙
	 Scraping 官渡
	 Scraping 西山
	 Scraping 呈贡
	 Scraping 晋宁
	 Scraping 嵩明
	 Scraping 东川
	 东川 has 0 results
	 Scraping 富民
	 Scraping 宜良
	 宜良 has 0 results
	 Scraping 石林
	 石林 has 0 results
	 Scraping 寻甸
	 寻甸 has 0 results
	 Scraping 禄劝
	 禄劝 has 0 results
	 Scraping 安宁
Scraping 云南西双版纳
	 Scraping 景洪市
	 Scraping 勐海县
	 勐海县 has 0 results
	 Scraping 勐腊县
	 勐腊县 has 0 results
Scraping 浙江杭州
	 Scraping 西湖
	 Scraping 钱塘区
	 

In [10]:
# Persist the accumulated failure records to the file 'Fail列表' in the working directory.
joblib.dump(all_failed, 'Fail列表')

['Fail列表']