|  | 
|  | 1 | +# -*- coding: utf-8 -*- | 
|  | 2 | +# 作者:             inspurer(月小水长) | 
|  | 3 | +# 创建时间:          2020/11/27 22:10 | 
|  | 4 | +# 运行环境           Python3.6+ | 
|  | 5 | +# github            https://github.com/inspurer | 
|  | 6 | +# qq邮箱            2391527690@qq.com | 
|  | 7 | +# 微信公众号         月小水长(ID: inspurer) | 
|  | 8 | +# 文件备注信息       todo | 
|  | 9 | + | 
|  | 10 | +import requests | 
|  | 11 | + | 
|  | 12 | +from datetime import datetime, timedelta | 
|  | 13 | + | 
|  | 14 | +from lxml import etree | 
|  | 15 | + | 
|  | 16 | +import csv | 
|  | 17 | + | 
|  | 18 | +import os | 
|  | 19 | + | 
|  | 20 | +from time import sleep | 
|  | 21 | +from random import randint | 
|  | 22 | + | 
|  | 23 | + | 
def parseTime(unformatedTime, now=None):
    """Convert a relative timestamp ('N分钟…' / 'N小时…') to an absolute one.

    :param unformatedTime: time text as scraped from the page, e.g. '5分钟前'
    :param now: reference ``datetime`` the offset is subtracted from;
                defaults to ``datetime.now()`` (exposed so results are reproducible)
    :return: the computed time formatted as 'YYYY-MM-DD HH:MM', or
             ``unformatedTime`` unchanged when it is not a relative time this
             function can interpret
    """
    # (marker text, timedelta keyword); minutes are checked before hours,
    # matching the original branch order.
    for marker, unit in (('分钟', 'minutes'), ('小时', 'hours')):
        if marker not in unformatedTime:
            continue
        amount = unformatedTime[:unformatedTime.find(marker)]
        try:
            delta = timedelta(**{unit: int(amount)})
        except (ValueError, OverflowError):
            # Prefix is not a usable integer (e.g. '半小时前'): keep the raw
            # text instead of aborting the whole scrape.
            return unformatedTime
        reference = datetime.now() if now is None else now
        return (reference - delta).strftime('%Y-%m-%d %H:%M')
    return unformatedTime
|  | 37 | + | 
|  | 38 | + | 
def _firstOrEmpty(items):
    """Return the first item of an XPath result list, or '' when it is empty."""
    return items[0] if items else ''


def _firstNodeText(nodes):
    """Return the stripped text content of the first matched element, or ''."""
    if not nodes:
        return ''
    return nodes[0].xpath('string(.)').strip()


def dealHtml(html):
    """Extract the news entries of one result page, print them and append them to the CSV.

    Rows are appended to the file named by the module-level ``fileName``
    (set by ``doSpider``), in the column order title, source, time, summary.

    :param html: parsed page; must support ``.xpath()`` (e.g. the result of ``etree.HTML``)
    :return: None
    """
    results = html.xpath('//div[@class="result-op c-container xpath-log new-pmd"]')

    rows = []

    for result in results:
        # A result missing any of these pieces yields '' for that field
        # rather than raising IndexError and losing the rest of the page.
        title = _firstNodeText(result.xpath('.//h3/a'))
        summary = _firstNodeText(
            result.xpath('.//span[@class="c-font-normal c-color-text"]'))

        # "./" selects direct children only; ".//" selects descendants at any depth.
        source, dateTime = '', ''
        infos = result.xpath('.//div[@class="news-source"]')
        if infos:
            # The last two spans are read as (source, time).
            # NOTE(review): with a single span it is taken as the time — confirm
            # against the live markup.
            source = _firstOrEmpty(infos[0].xpath(".//span[last()-1]/text()"))
            dateTime = _firstOrEmpty(infos[0].xpath(".//span[last()]/text()"))

        dateTime = parseTime(dateTime)

        print('标题', title)
        print('来源', source)
        print('时间', dateTime)
        print('概要', summary)
        print('\n')

        rows.append([title, source, dateTime, summary])

    with open(fileName, 'a+', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerows(rows)
|  | 74 | + | 
|  | 75 | + | 
# Request headers sent with every search request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.142 Safari/537.36'
    ),
    'Referer': (
        'https://www.baidu.com/s'
        '?rtt=1&bsst=1&cl=2&tn=news'
        '&word=%B0%D9%B6%C8%D0%C2%CE%C5&fr=zhidao'
    ),
}

# Search endpoint.
url = 'https://www.baidu.com/s'

# Base query-string parameters; insertion order matches the original literal.
params = dict(
    ie='utf-8',
    medium=0,
    # rtt=4 sorts by time, rtt=1 sorts by focus
    rtt=1,
    bsst=1,
    rsv_dl='news_t_sk',
    cl=2,
    tn='news',
    rsv_bp=1,
    oq='',
    rsv_btype='t',
    f=8,
)
|  | 97 | + | 
|  | 98 | + | 
def _parseTotal(text):
    """Return the first number in ``text`` (thousands separators ignored), or 0 if none."""
    digits = []
    for ch in text:
        if ch in '0123456789':
            digits.append(ch)
        elif digits and ch not in ',，':
            # First run of digits/commas has ended.
            break
    return int(''.join(digits)) if digits else 0


def doSpider(keyword, sortBy='focus'):
    """Scrape the search results for ``keyword`` into '<keyword>.csv'.

    Fetches the first result page, reads the reported total from it, then
    walks the remaining pages ten results at a time, handing each parsed page
    to ``dealHtml``. Sets the module-level ``fileName`` that ``dealHtml``
    appends to.

    :param keyword: search keyword
    :param sortBy: sort order: 'focus' (by focus) or 'time' (by time); default 'focus'
    :return: None
    """
    global fileName
    fileName = '{}.csv'.format(keyword)

    # Write the header row only when the file is first created.
    if not os.path.exists(fileName):
        with open(fileName, 'w+', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(['title', 'source', 'time', 'summary'])

    # Work on copies so 'wd', 'rtt', 'pn' and 'Referer' from this run do not
    # leak into the shared module-level dicts and affect later calls.
    query = dict(params, wd=keyword)
    if sortBy == 'time':
        query['rtt'] = 4
    requestHeaders = dict(headers)

    # Without a timeout a stalled connection would block forever.
    timeout = 10

    response = requests.get(url=url, params=query, headers=requestHeaders,
                            timeout=timeout)

    html = etree.HTML(response.text)
    if html is None:
        # Empty / unparsable response: nothing to extract.
        return

    dealHtml(html)

    totalText = html.xpath('//div[@id="header_top_bar"]/span/text()')
    if not totalText:
        # No result counter on the page, so the page count is unknown.
        return

    total = _parseTotal(totalText[0])

    # Ceiling division: a final partial page still has to be fetched.
    pageNum = -(-total // 10)

    for page in range(1, pageNum):
        print('第 {} 页\n\n'.format(page))
        requestHeaders['Referer'] = response.url
        query['pn'] = page * 10

        response = requests.get(url=url, headers=requestHeaders, params=query,
                                timeout=timeout)

        html = etree.HTML(response.text)
        if html is None:
            break

        dealHtml(html)

        # Random pause between requests.
        sleep(randint(2, 4))
|  | 144 | + | 
|  | 145 | + | 
if __name__ == "__main__":
    # Script entry point: scrape the focus-sorted results for the sample keyword.
    doSpider('马保国', sortBy='focus')
0 commit comments