-
Notifications
You must be signed in to change notification settings - Fork 1
/
pyspider_HZ.py
124 lines (114 loc) · 5.7 KB
/
pyspider_HZ.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import json
import time

import xmltodict
from bs4 import BeautifulSoup
from pyspider.libs.base_handler import *

from my import My
'''惠州'''
class Handler(My):
    """pyspider handler for Huizhou ('''惠州''') government notices.

    Two sources are crawled:
      * ghjs.huizhou.gov.cn -- planning-permit bulletin lists (save source 'GH')
      * www.hzgtjy.com      -- land transaction results       (save source 'GT')

    Every list entry found is scheduled through ``self.content_page``
    (inherited from ``My``) with a ``save`` dict carrying the destination
    table name (``self.table_name[...]``) and the source tag.
    """

    name = "HZ"

    def _land_headers(self):
        """Return a fresh browser-like header dict for www.hzgtjy.com POSTs.

        NOTE(review): the session cookie and RA-* values look captured from a
        one-off browser session -- confirm the endpoint still accepts them.
        """
        return {
            'Accept': 'text/plain, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Content-Length': '37',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cookie': 'ASP.NET_SessionId=2nilcdld2ldvwxctoix1ykld',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Origin': 'http://www.hzgtjy.com',
            'RA-Sid': '6FA100BB-20150228-025839-140a2d-0496ae',
            'RA-Ver': '3.0.7',
            'Referer': 'http://www.hzgtjy.com/index/Index4/',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, re-scheduled once every 24 hours."""
        # Planning-bureau bulletin lists: (URL slug, destination table).
        plan_targets = [
            ('ph_xzyjs', self.table_name[0]),
            ('ph_ydghxkz', self.table_name[1]),
            ('ph_gcghxkz', self.table_name[2]),
            ('ph_ghyshgz', self.table_name[4]),
        ]
        for slug, table in plan_targets:
            self.crawl(
                'http://ghjs.huizhou.gov.cn/business/htmlfiles/ghjsj/%s/index.html' % slug,
                callback=self.plan_page, age=1,
                save={'type': table, 'source': 'GH'})
        # Deliberately disabled district-level lists (kept for reference),
        # all under http://ghjs.huizhou.gov.cn/publicfiles/business/htmlfiles/ghjsj/:
        #   pq_xzyjs   -> self.table_name[9]
        #   pq_ydghxkz -> self.table_name[10]
        #   pq_gcghxkz -> self.table_name[11]
        self.headers = self._land_headers()
        # First page of the land-results AJAX listing (10 rows per page,
        # newest offers first); remaining pages are scheduled by land_page.
        data = {'page': 1, 'size': 10, 'orderBy': 'OFFERDATE-desc'}
        self.crawl('http://www.hzgtjy.com/Index/PublicResults?page=1', method='POST',
                   age=1, save={'type': self.table_name[14], 'source': 'GT'},
                   headers=self.headers, data=data, callback=self.land_page)

    def plan_page(self, response):
        """Extract detail links from a planning-bureau list page.

        The page inlines an XML document as a JavaScript string literal: the
        second <script language="JavaScript"> tag holds it, and the XML is
        the 14th single-quote-delimited token of that script's text.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        scripts = soup('script', {'language': 'JavaScript'})
        xml_text = scripts[1].get_text().split('\'')[13]
        records = xmltodict.parse(xml_text)['xml']['RECS']['INFO']
        base = "http://ghjs.huizhou.gov.cn/publicfiles/business/htmlfiles/"
        for record in records:
            link = self.real_path(base, record['InfoURL'])
            self.crawl(link, callback=self.content_page, save=response.save)
            time.sleep(0.1)  # light throttle between scheduled detail pages

    def land_page(self, response):
        """Handle page 1 of the land results; fan out details and pages 2..N.

        Uses ``json.loads`` instead of the original ``eval`` +
        null/true/false shims: evaluating an untrusted response body can
        execute arbitrary code, while json.loads parses those literals
        natively (JSON null maps to None rather than '').
        """
        payload = json.loads(response.text)
        domain = 'http://www.hzgtjy.com/Index/LandInfo/'
        for item in payload['data']:
            self.crawl(domain + str(item['LANDINFO_ID']),
                       callback=self.content_page, save=response.save,
                       fetch_type='js')
        # Ceiling division: the endpoint serves 10 results per page.
        page_count = (payload['total'] + 9) // 10
        url, query = response.url.split('?')
        params = {}
        for pair in query.split('&'):
            parts = pair.split('=')
            params[parts[0]] = parts[1]
        self.headers = self._land_headers()
        for page in range(2, page_count + 1):
            data = {'size': '10', 'orderBy': 'OFFERDATE-desc', 'page': str(page)}
            params['page'] = str(page)
            self.crawl(url, params=params, data=data, method='POST',
                       callback=self.land_list_page,
                       age=1, save=response.save, headers=self.headers)

    def land_list_page(self, response):
        """Handle pages >= 2 of the land results list (same shape as page 1)."""
        payload = json.loads(response.text)  # was eval(); see land_page
        domain = 'http://www.hzgtjy.com/Index/LandInfo/'
        for item in payload['data']:
            self.crawl(domain + str(item['LANDINFO_ID']),
                       callback=self.content_page, save=response.save,
                       fetch_type='js')