In [37]:
from datetime import datetime
import json
import os
import re

from lxml import etree
import requests

import utils


BASE_URL = 'https://s.weibo.com'
JSON_DIR = './raw'
ARCHIVE_DIR = './archives'


def getHTML(url):
    ''' 获取网页 HTML 返回字符串

    Args:
        url: str, 网页网址
    Returns:
        HTML 字符串
    '''
    # Cookie 有效期至2023-02-10
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
        'Cookie': 'SUB=_2AkMVWDYUf8NxqwJRmP0Sz2_hZYt2zw_EieKjBMfPJRMxHRl-yj9jqkBStRB6PtgY-38i0AF7nDAv8HdY1ZwT3Rv8B5e5; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFencmWZyNhNlrzI6f0SiqP'
    }
    response = requests.get(url, headers=headers)
    if response.encoding == 'ISO-8859-1':
        response.encoding = response.apparent_encoding if response.apparent_encoding != 'ISO-8859-1' else 'utf-8'
    return response.text


# 使用 xpath 解析 HTML
def parseHTMLByXPath(content):
    ''' 使用 xpath 解析 HTML, 提取榜单信息

    Args:
        content: str, 待解析的 HTML 字符串
    Returns:
        榜单信息的字典 字典
    '''
    html = etree.HTML(content)

    titles = html.xpath(
        '//tr[position()>1]/td[@class="td-02"]/a[not(contains(@href, "javascript:void(0);"))]/text()')
    hrefs = html.xpath(
        '//tr[position()>1]/td[@class="td-02"]/a[not(contains(@href, "javascript:void(0);"))]/@href')
    hots = html.xpath(
        '//tr[position()>1]/td[@class="td-02"]/a[not(contains(@href, "javascript:void(0);"))]/../span/text()')
    titles = [title.strip() for title in titles]
    hrefs = [BASE_URL + href.strip() for href in hrefs]
    zhongss = hots
    zhongs=[]
    for i in range(len(zhongss)):

        try:
            print(zhongss[i].split(' ')[-2])
            zhong = zhongss[i].split(' ')[-2]
        except:
            zhong = " "
        zhongs.append(zhong)
    print(zhongs)
    
        
    hots = [int(hot.strip().split(' ')[-1])
            for hot in hots]  # 该处除了热度还会返回大致分类，形如 `剧集 53412536`，前为分类，后为热度
    
    correntRank = {}
    for i, title in enumerate(titles):
        correntRank[title] = {'href': hrefs[i], 'hot': hots[i], 'zhong': zhongs[i]}

    return correntRank


# 更新本日榜单
def updateJSON(correntRank):
    ''' 更新当天的 JSON 文件

    Args:
        correntRank: dict, 最新的榜单信息
    Returns:
        与当天历史榜单对比去重, 排序后的榜单信息字典
    '''
    filename = datetime.today().strftime('%Y%m%d') + '.json'
    filename = os.path.join(JSON_DIR, filename)

    # 文件不存在则创建
    if not os.path.exists(filename):
        utils.save(filename, {})

    historyRank = json.loads(utils.load(filename))
    for k, v in correntRank.items():
        # 若当前榜单和历史榜单有重复的，取热度数值(名称后面的数值)更大的一个
        if k in historyRank:
            historyRank[k]['hot'] = max(
                historyRank[k]['hot'], correntRank[k]['hot'])
        # 若没有，则添加
        else:
            historyRank[k] = v

    # 将榜单按 hot 值排序
    rank = {k: v for k, v in sorted(
        historyRank.items(), key=lambda item: item[1]['hot'], reverse=True)}

    # 更新当天榜单 json 文件
    utils.save(filename, rank)
    print(rank)
    return rank


def updateReadme(rank):
    ''' 更新 README.md

    Args:
        rank: dict, 榜单信息
    Returns:
        None
    '''
    filename = './123.txt'

    line = '1. [{title}]({href}) {hot}'
    lines = [line.format(title=k, hot=v['hot'], href=v['href'])
             for k, v in rank.items()]
    rank = '\n'.join(lines)

    rank = '最后更新时间 {}\n\n'.format(
        datetime.now().strftime('%Y-%m-%d %X')) + rank
    rank = '<!-- Rank Begin -->\n\n' + rank + '\n<!-- Rank End -->'

    content = re.sub(
        r'<!-- Rank Begin -->[\s\S]*<!-- Rank End -->', rank, utils.load(filename))
    utils.save(filename, content)
    

def main():
    url = '/top/summary'

    content = getHTML(BASE_URL + url)
    # with open('test.html', 'w', encoding='utf-8') as f:
    #     f.write(content)
    # print(content)
    correntRank = parseHTMLByXPath(content)
    rankJSON = updateJSON(correntRank)
    updateReadme(rankJSON)


if __name__ == '__main__':
    main()





电影






剧集
剧集








晚会
音乐

电影







剧集
音乐


电影














['', '', '', '电影', '', '', '', '', '', '', '剧集', '剧集', '', '', '', '', '', '', '', '', '晚会', '音乐', '', '电影', '', '', '', '', '', '', '', '剧集', '音乐', '', '', '电影', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
{'王俊凯 大家不要抱着戾气生活': {'href': 'https://s.weibo.com/weibo?q=%E7%8E%8B%E4%BF%8A%E5%87%AF%20%E5%A4%A7%E5%AE%B6%E4%B8%8D%E8%A6%81%E6%8A%B1%E7%9D%80%E6%88%BE%E6%B0%94%E7%94%9F%E6%B4%BB&t=31&band_rank=1&Refer=top', 'hot': 1926227, 'zhong': ''}, '王嘉尔 透视装': {'href': 'https://s.weibo.com/weibo?q=%E7%8E%8B%E5%98%89%E5%B0%94%20%E9%80%8F%E8%A7%86%E8%A3%85&t=31&band_rank=2&Refer=top', 'hot': 1659076, 'zhong': ''}, '新冠治疗费用报销图解': {'href': 'https://s.weibo.com/weibo?q=%23%E6%96%B0%E5%86%A0%E6%B2%BB%E7%96%97%E8%B4%B9%E7%94%A8%E6%8A%A5%E9%94%80%E5%9B%BE%E8%A7%A3%23&t=31&band_rank=3&Refer=top', 'hot': 1641138, 'zhong': ''}, '邓超儿子等等长得好帅': {'href': 'https://s.weibo.com/weibo?q=%E9%82%93%E8%B6%85%E5%84%BF%E5%AD%90%E7%AD%89%E7%A

In [8]:
if __name__ == '__main__':
    main()


<!-- Rank Begin -->

最后更新时间 2023-01-11 15:01:38

1. [权志龙姐姐开北京首店](https://s.weibo.com/weibo?q=%23%E6%9D%83%E5%BF%97%E9%BE%99%E5%A7%90%E5%A7%90%E5%BC%80%E5%8C%97%E4%BA%AC%E9%A6%96%E5%BA%97%23&t=31&band_rank=1&Refer=top) 2001437
1. [一家4口三亚溺水全部遇难](https://s.weibo.com/weibo?q=%23%E4%B8%80%E5%AE%B64%E5%8F%A3%E4%B8%89%E4%BA%9A%E6%BA%BA%E6%B0%B4%E5%85%A8%E9%83%A8%E9%81%87%E9%9A%BE%23&t=31&band_rank=2&Refer=top) 1789683
1. [短发新娘不穿婚纱被误认成伴郎](https://s.weibo.com/weibo?q=%23%E7%9F%AD%E5%8F%91%E6%96%B0%E5%A8%98%E4%B8%8D%E7%A9%BF%E5%A9%9A%E7%BA%B1%E8%A2%AB%E8%AF%AF%E8%AE%A4%E6%88%90%E4%BC%B4%E9%83%8E%23&t=31&band_rank=1&Refer=top) 1497125
1. [新冠康复如何做到清淡饮食](https://s.weibo.com/weibo?q=%23%E6%96%B0%E5%86%A0%E5%BA%B7%E5%A4%8D%E5%A6%82%E4%BD%95%E5%81%9A%E5%88%B0%E6%B8%85%E6%B7%A1%E9%A5%AE%E9%A3%9F%23&t=31&band_rank=3&Refer=top) 1492124
1. [电费1个月3481元竟有电费刺客](https://s.weibo.com/weibo?q=%23%E7%94%B5%E8%B4%B91%E4%B8%AA%E6%9C%883481%E5%85%83%E7%AB%9F%E6%9C%89%E7%94%B5%E8%B4%B9%E5%88%BA%E5%AE%A2%23&t=31&band_

NameError: name 'url' is not defined