In [1]:
import requests, time
from bs4 import BeautifulSoup

def get_news_list(root, url, date_boundary, timeout=10):
    """Collect the articles listed on a paginated news index, down to a cutoff date.

    Fetches ``root + url``, reads every ``<li>`` of the ``ul.wp_article_list``
    element and follows the "next" link of ``ul.wp_paging`` until an article
    older than ``date_boundary`` is found or there is no further page.
    The early stop assumes the listing is ordered newest first (the code stops
    at the first older entry) -- NOTE(review): confirm against the site.

    Args:
        root: URL prefix prepended to every relative listing path.
        url: Path of the first listing page, appended to ``root``.
        date_boundary: Cutoff as epoch seconds in local time (the value
            ``time.mktime`` returns). Articles dated on or after it are kept.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of ``[date, title, href]`` lists, in page order, where ``date``
        is the ``YYYY-MM-DD`` string shown on the page.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error status.
        ValueError: if a listed date is not in ``YYYY-MM-DD`` format.
    """
    article_list = []
    visited = set()  # listing paths already fetched; guards against paging loops
    while url not in visited:
        visited.add(url)
        response = requests.get(root + url, timeout=timeout)
        response.raise_for_status()
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')

        listing = soup.find('ul', 'wp_article_list')
        items = listing.find_all('li') if listing is not None else []
        if not items:
            break  # nothing listed on this page: stop instead of reusing stale state

        reached_boundary = False
        for item in items:
            date_tag = item.find('span', 'Article_PublishDate')
            link = item.find('a')
            if date_tag is None or link is None:
                continue  # not an article row
            href = link.get('href')
            if not href:
                continue
            date = date_tag.get_text(strip=True)
            stamp = time.mktime(time.strptime(date, "%Y-%m-%d"))
            if stamp < date_boundary:
                reached_boundary = True
                break
            title = link.get('title') or link.get_text(strip=True)
            article_list.append([date, title, href])

        # Keep paging while every article seen is still on or after the cutoff.
        # Using the same ">=" rule as the filter above means boundary-dated
        # articles that spill onto the next page are not lost.
        if reached_boundary:
            break

        paging = soup.find('ul', 'wp_paging clearfix')
        next_link = paging.find('a', {'class': 'next'}) if paging is not None else None
        next_url = next_link.get('href') if next_link is not None else None
        if not next_url or next_url.lower().startswith('javascript'):
            break  # last page: no usable "next" target
        url = next_url
    return article_list

In [3]:
def crawl_news(root, article_list, timeout=10):
    """Download each listed article and return its details keyed by a 1-based index.

    Args:
        root: URL prefix prepended to hrefs that are not absolute.
        article_list: Iterable of ``[date, title, href]`` entries, as produced
            by ``get_news_list``.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A dict mapping 1, 2, 3, ... (order of ``article_list``) to a dict with
        the keys ``date``, ``title``, ``source``, ``content``, ``views`` and
        ``url``. When a page lacks the expected ``div.acd`` layout, the missing
        fields fall back to ``''`` (``content``, ``views``) or ``'none'``
        (``source``) instead of aborting the whole crawl.
    """
    news_dict = {}
    for index, (date, title, href) in enumerate(article_list, start=1):
        # Only a real scheme prefix marks an absolute URL; a relative link that
        # merely contains "http" somewhere must still be prefixed with root.
        if href.startswith(('http://', 'https://')):
            path = href
        else:
            path = root + href
        response = requests.get(path, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        text = u''
        department = 'none'
        view_times = u''
        # Attribute filters must be dicts; the previous set/list literals were
        # interpreted by bs4 as "class is any of these values".
        body = soup.find('div', {'class': 'acd'})
        if body is not None:
            content = body.find('div', {'class': 'wp_articlecontent'})
            if content is not None:
                text = u''.join(content.strings)

            publishers = body.find_all('span', {'class': 'arti_publisher'})
            if len(publishers) > 1:
                # The second publisher span is split on the full-width colon
                # (U+FF1A); the part after it is stored as the source.
                label = publishers[1].get_text()
                department = label.partition(u'\uff1a')[2] or 'none'

            counter = body.find('span', {'class': 'WP_VisitCount'})
            if counter is not None:
                view_times = counter.text

        news_dict[index] = {
            'date': date,
            'title': title,
            'source': department,
            'content': text,
            'views': view_times,
            'url': path,
        }
        time.sleep(0.1)  # throttle requests to the site
    return news_dict

def search(keywords, news_dict):
    """Find the news entries whose title, source or content contains ``keywords``.

    Args:
        keywords: Substring to look for (plain ``in`` containment test).
        news_dict: Mapping of news id to a dict holding at least the keys
            ``date``, ``title``, ``source`` and ``content``.

    Returns:
        A dict with the keys ``'title'``, ``'source'`` and ``'content_list'``.
        Each value is a list of ``[news_id, date, matched_field_value]`` lists
        in the iteration order of ``news_dict``.
    """
    fields = ('title', 'source', 'content')
    matches = {field: [] for field in fields}
    for news_id in news_dict:
        entry = news_dict[news_id]
        for field in fields:
            if keywords in entry[field]:
                matches[field].append([news_id, entry['date'], entry[field]])

    # Content matches are published under 'content_list', unlike the other two keys.
    return {
        'title': matches['title'],
        'source': matches['source'],
        'content_list': matches['content'],
    }

In [5]:
if __name__ == '__main__':  # true when this module is executed as the program rather than imported
    # Site prefix and the path of the first listing page.
    root, url = "http://www.suibe.edu.cn/_s19", "/1416/list.psp"
    # Cutoff date converted to local-time epoch seconds.
    date_boundary = time.mktime(time.strptime("2019-08-01", "%Y-%m-%d"))

    # Stage 1: gather the listing entries; stage 2: download every article.
    news_list = get_news_list(root, url, date_boundary)
    news_dict = crawl_news(root, news_list)

    # Stage 3: show the id and date, then the title, of every title match.
    result = search(u'上海对外经贸大学专场', news_dict)
    for x in result['title']:
        print(*x[:2])
        print(x[2])

In [6]:
# Repeat the title search on the already-crawled news_dict (no new requests)
# and print the id and date, then the title, of each match.
result = search(u'上海对外经贸大学专场', news_dict)
for x in result['title']:
    print(*x[:2])
    print(x[2])