In [1]:
import re
import pandas as pd
from urllib.parse import urljoin
import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib
from time import sleep
import random
from bs4 import BeautifulSoup as bs

In [2]:
def urlGet(url, headers, max_try=3, sleep_time=5, timeout=10):
    '''
    GET `url` with retries and return the first response whose status is 200.

    Parameters
    ----------
    url : str
        Address to request.
    headers : dict
        HTTP headers passed straight to requests.get.
    max_try : int
        Maximum number of requests to send before giving up.
    sleep_time : int or float
        Seconds to wait between two consecutive attempts.
    timeout : int or float
        Per-request timeout in seconds, so a stalled connection counts as a
        failed attempt instead of blocking forever.

    Returns
    -------
    requests.Response or None
        The successful response, or None when every attempt failed (non-200
        status or a requests-level error).
    '''
    for attempt in range(max_try):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                # Return right away: a success on the last allowed attempt
                # is still a success.
                return response
        except requests.RequestException:
            # Connection errors, timeouts, etc. are treated like a bad status.
            pass
        # No point in waiting once the attempts are used up.
        if attempt < max_try - 1:
            sleep(sleep_time)
    return None


def parse_source(source, prefix):
    '''
    Parse one listing-result page into a list of row dicts.

    Parameters
    ----------
    source : str
        HTML text of a result page.
    prefix : dict
        Key/value pairs copied into every row; the dict itself is not modified.

    Returns
    -------
    list of dict
        One dict per listing that could be parsed completely. A listing with
        any missing or malformed field (including a price without a unit) is
        skipped, so a single bad entry never discards the rest of the page.
    '''
    soup = bs(source, 'html.parser')
    parsed = []
    for one in soup.select('div.content__list > div'):
        try:
            # First column: title text, split on '·' into rental type and the rest
            anchor = one.select_one('div > p > a')
            title = anchor.get_text().replace('\n', '').strip()
            # Community name: first space-separated token after the last '·'
            name = title.split('·')[-1].split(' ')[0]
            # Rental type: text before the first '·'
            renting = title.split('·')[0]

            # Absolute link to the listing
            url = urljoin('https://tj.lianjia.com/zufang/', anchor['href'])

            # Second column: description lines with '/' separators removed
            parts = one.select_one('.content__list--item--des').get_text().split('\n')
            parts = [i.replace('/', '').strip() for i in parts if i.strip() not in ['', '/']]
            if len(parts) == 5:
                mianji, chaoxiang, huxing, louceng = parts[1:]
                louceng = louceng.replace(' ', '')
            else:
                # Any other shape: keep area and layout, mark orientation and floor as '无'
                mianji, chaoxiang, huxing, louceng = parts[1], '无', parts[-1], '无'

            # Third column: tags joined by single spaces
            tag = one.select_one('p:nth-child(3)').get_text().replace('\n', ' ').strip()

            # Price: "<amount> <unit>"; both pieces are read inside the try so
            # a missing unit skips only this listing
            price = one.select_one('div > span').get_text().split(' ')
            price_value, price_unit = price[0], price[1]
        except Exception:
            # Not a complete listing (ad block, layout change, ...): skip it.
            continue

        row = prefix.copy()
        row.update({
            '小区名称' : name, '链接' : url, '租赁方式' : renting, '面积' : mianji, '房屋朝向' : chaoxiang,
            '户型' : huxing, '楼层' : louceng, '标签' : tag, '价格' : price_value, '价格单位' : price_unit
        })
        parsed.append(row)
    return parsed


def get_count(response):
    '''
    Return, as an int, the number shown in the page's
    `.content__title--hl` element.
    '''
    counter = bs(response.text, 'html.parser').select_one('.content__title--hl')
    return int(counter.get_text())


def get_max_page(response):
    '''
    Return the raw `data-totalpage` attribute of the page's `.content__pg`
    element (the caller converts it to int).
    '''
    pager = bs(response.text, 'html.parser').select_one('.content__pg')
    return pager['data-totalpage']
    
    
def split_series(series, n):
    '''
    Cut `series` into `n` consecutive slices of near-equal length.

    The first `len(series) % n` slices receive one extra element; trailing
    slices are empty when `n` exceeds the number of elements.
    '''
    base, extra = divmod(len(series), n)
    sizes = [base + 1 if idx < extra else base for idx in range(n)]

    chunks = []
    cursor = 0
    for size in sizes:
        chunks.append(series[cursor:cursor + size])
        cursor += size
    return chunks
            
       
def collect_data(url_list, headers, prefix):
    '''
    Download and parse every page in `url_list` using one worker per header set.

    The URLs are divided into len(headers) consecutive partitions with
    split_series; partition i is processed by its own thread using headers[i].
    Pages that cannot be fetched or parsed are reported with print and skipped.

    Parameters
    ----------
    url_list : list of str
        Page URLs to scrape.
    headers : sequence of dict
        Request header sets; its length determines the number of workers.
    prefix : dict
        Passed to parse_source for every page.

    Returns
    -------
    list of dict
        All rows parsed from all pages; ordering across workers is not defined.
    '''
    data = []

    def collect_single(url_list, header, prefix):
        for url in url_list:
            response = urlGet(url, header)
            if response is None:
                print('get failed : ', url)
                sleep(5)
                continue
            try:
                # extend mutates the shared list in place, so the closure
                # never has to rebind `data`.
                data.extend(parse_source(response.text, prefix))
            except Exception:
                print('parsing failed : ', url)
            sleep(3)

    headers_count = len(headers)
    splitted_url_list = split_series(url_list, headers_count)

    with ThreadPoolExecutor(max_workers=headers_count) as executor:
        futures = [
            executor.submit(collect_single, splitted_url_list[i], headers[i], prefix)
            for i in range(headers_count)
        ]

        # result() re-raises anything unexpected that happened inside a worker.
        for future in as_completed(futures):
            future.result()

    return data

In [3]:
def tianjin():
    '''
    Scrape Tianjin rental listings district by district and area by area.

    Reads the module-level `headers`, extends the module-level `alldata` with
    the rows returned by collect_data, and appends to the module-level `failed`
    the URL of every area whose listing count exceeds 3000. Progress and
    failures are reported with print; a failure at district or area level
    skips that unit and the crawl continues.

    Returns
    -------
    None
    '''
    global alldata
    global failed
    global headers

    tianjin_url = 'https://tj.lianjia.com/zufang/'
    tianjin_response = urlGet(tianjin_url, headers[0])
    if tianjin_response is None:
        print('Failed to get 天津')
        return None
    tianjin_soup = bs(tianjin_response.text, 'html.parser')
    # District links: anchors of the area filter list that carry no `rel` attribute.
    districts = tianjin_soup.select_one('div.filter').find('ul', {'data-target' : 'area'}).find_all('a', {'rel' : None})

    for district in districts:
        district_name = district.get_text()
        print(f'\t Scraping 天津 {district_name}')
        district_url = urljoin(tianjin_url, district['href'])
        district_response = urlGet(district_url, headers[1])
        if district_response is None:
            print(f'\t {district_name} failed')
            sleep(3)
            continue
        district_soup = bs(district_response.text, 'html.parser')
        try:
            # The first anchor is dropped (presumably an "all areas" entry — TODO confirm).
            areas = district_soup.select_one('div.filter').find('ul', {'data-target' : 'area', 'class' : None}).find_all('a')[1:]
        except Exception:
            # Unexpected page layout: skip this district like a failed download.
            print(f'\t {district_name} failed')
            sleep(3)
            continue
        sleep(3)

        for area in areas:
            area_name = area.get_text()
            print(f'\t\t Scraping 天津 {district_name} {area_name}')
            areat_url = urljoin(tianjin_url, area['href'])
            area_response = urlGet(areat_url, headers[1])
            if area_response is None:
                print(f'\t\t {area_name} failed')
                sleep(3)
                continue
            sleep(3)
            # Parse the counter once and reuse it; a page without the counter
            # skips only this area.
            try:
                listing_count = get_count(area_response)
            except Exception:
                print('get count failed : ', areat_url)
                continue
            # Areas above 3000 listings are set aside in `failed`
            # (presumably the site cannot page past that many — TODO confirm).
            if listing_count > 3000:
                print(listing_count)
                print(f'\t\t {area_name} exceeding 3000')
                failed.append(areat_url)
                continue
            # Build the list of page URLs to crawl
            try:
                max_page = int(get_max_page(area_response))
            except Exception:
                print('get max page faild : ', areat_url)
                sleep(3)
                continue
            urls = [f'{areat_url}pg{page}/' for page in range(1, max_page + 1)]
            # Crawl the pages
            prefix = {'城市' : '天津', '地区' : district_name, '区域' : area_name}
            alldata += collect_data(urls, headers, prefix)

In [4]:
# Request header sets used by the scraper (indexed as headers[0], headers[1], ...).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load './headers' if it comes from a trusted source.
headers = joblib.load('./headers')
# Accumulators filled by tianjin(): parsed rows and URLs of areas set aside.
alldata = []
failed = []

In [5]:
# Run the crawl; it fills the module-level `alldata` and `failed` lists and
# prints progress for every district and area.
tianjin()

	 Scraping 天津 和平
		 Scraping 天津 和平 南营门街
		 Scraping 天津 和平 南市
		 Scraping 天津 和平 劝业场
		 Scraping 天津 和平 体育馆街
		 Scraping 天津 和平 小白楼
		 Scraping 天津 和平 新兴街
	 Scraping 天津 南开
		 Scraping 天津 南开 长虹街
		 Scraping 天津 南开 广开街
		 Scraping 天津 南开 鼓楼街
		 Scraping 天津 南开 华苑
		 Scraping 天津 南开 嘉陵道街
		 Scraping 天津 南开 水上公园街
		 Scraping 天津 南开 体育中心街
get failed :  https://tj.lianjia.com/zufang/tiyuzhongxinjie/pg12/
		 Scraping 天津 南开 王顶堤
		 Scraping 天津 南开 万兴街
		 Scraping 天津 南开 兴南街
		 Scraping 天津 南开 学府街
		 Scraping 天津 南开 向阳路
	 Scraping 天津 河西
		 Scraping 天津 河西 陈塘庄
		 Scraping 天津 河西 大营门
		 Scraping 天津 河西 东海街
		 Scraping 天津 河西 挂甲寺
		 Scraping 天津 河西 尖山
		 Scraping 天津 河西 柳林街
		 Scraping 天津 河西 马场街
		 Scraping 天津 河西 梅江
		 Scraping 天津 河西 天塔街
		 Scraping 天津 河西 桃园街
		 Scraping 天津 河西 新梅江
		 Scraping 天津 河西 下瓦房
		 Scraping 天津 河西 越秀路
		 Scraping 天津 河西 友谊路
	 Scraping 天津 河北
		 Scraping 天津 河北 光复道
		 Scraping 天津 河北 鸿顺里街
		 Scraping 天津 河北 江都路
		 Scraping 天津 河北 靖江路
		 Scraping 天津 河北 建昌道
		 Scraping 天津 河北 宁园
		 Scraping 天津 河北 铁东路
		 Sc

In [51]:
# Turn the collected rows into a table, order it by district ('地区') and then
# area ('区域'), and write it to CSV (the DataFrame index is written as well).
# NOTE(review): sort_values raises KeyError when `alldata` is empty.
(
    pd.DataFrame(alldata)
    .sort_values(by=['地区', '区域'])
    .to_csv('./天津市租房信息.csv')
)