-
Notifications
You must be signed in to change notification settings - Fork 0
/
sohu_spider.py
55 lines (41 loc) · 1.22 KB
/
sohu_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Sohu news crawler
import requests
from lxml import html
class Model(object):
    """Base class that gives subclasses a readable, attribute-listing repr."""

    def __repr__(self):
        """Return the class name followed by one ``name = (value)`` line per
        entry in the instance ``__dict__``, in insertion order."""
        rendered = []
        for attr, value in self.__dict__.items():
            rendered.append('{} = ({})'.format(attr, value))
        body = '\n '.join(rendered)
        return '\n<{}:\n {}\n>'.format(type(self).__name__, body)
class New(Model):
    """One news entry: a link (``url``) and its headline (``title``)."""

    def __init__(self):
        # Both fields start as empty strings. Targets are assigned left to
        # right, so ``url`` is inserted before ``title`` in ``__dict__``.
        self.url = self.title = ""
def new_from_div(div):
    """Build a ``New`` from one headline container element.

    ``div`` must support ``.xpath()``. ``url`` is the first ``href`` found on
    an ``<a class="h4">`` descendant, and ``title`` is the ``.text`` of the
    first such anchor. The pair is printed before the object is returned.

    Raises:
        IndexError: if ``div`` contains no matching anchor / href.
    """
    hrefs = div.xpath('.//a[@class="h4"]/@href')
    anchors = div.xpath('.//a[@class="h4"]')

    item = New()
    item.url = hrefs[0]
    item.title = anchors[0].text
    print(item.url, item.title)
    return item
def cached_url(url, filename='sohu_news.html', cache_dir='cached', timeout=10):
    """Return the body of ``url`` as bytes, going through an on-disk cache.

    The cache entry is the file ``cache_dir/filename``. NOTE: the entry is
    identified by ``filename`` only, not by ``url`` -- pass a distinct
    ``filename`` for every distinct page you want cached separately.

    Args:
        url: Address to download on a cache miss.
        filename: Name of the cache file (default keeps the original
            ``sohu_news.html``).
        cache_dir: Directory holding cache files; created on demand.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the crawler forever.

    Returns:
        The cached or freshly downloaded response body as ``bytes``.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    import os

    path = os.path.join(cache_dir, filename)

    # Cache hit: just try to read the file instead of checking first.
    try:
        with open(path, 'rb') as cached_file:
            return cached_file.read()
    except FileNotFoundError:
        pass  # cache miss -> download below

    response = requests.get(url, timeout=timeout)
    # Do not store error pages: a cached 4xx/5xx body would be served forever.
    response.raise_for_status()
    content = response.content

    # The cache directory may not exist yet on the first run.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as cached_file:
        cached_file.write(content)
    return content
def news_from_url(url):
    """Return a list of ``New`` objects parsed from the page at ``url``.

    The page bytes come from ``cached_url``; every ``<div>`` whose ``class``
    attribute is exactly ``h4WP`` is converted with ``new_from_div``.
    """
    tree = html.fromstring(cached_url(url))
    news = []
    for block in tree.xpath('//div[@class="h4WP"]'):
        news.append(new_from_div(block))
    return news
def main():
    """Run the crawler against https://m.sohu.com/.

    The parsed list is discarded here; ``new_from_div`` prints each entry as
    it is built.
    """
    news_from_url('https://m.sohu.com/')
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()