In [1]:
import html
import json
import os
import pickle
import re

import requests
from IPython.display import HTML, display
from lxml import etree
from requests.structures import CaseInsensitiveDict

# Guiding principles: encapsulation, reuse, readability -> refactored into a class
class MySearcher:
    """Keyword search over news articles scraped from tech.163.com.

    ``news_list`` holds one ``[docurl, title, text]`` list per article. The
    list is loaded from a local pickle cache when one exists; otherwise the
    articles are downloaded and the cache is written.
    """

    # Pickle file (relative to the working directory) used as the local cache.
    CACHE_FILE = 'news_list.dat'

    # JSONP list pages enumerating the articles to download.
    LIST_URLS = (
        "https://tech.163.com/special/00097UHL/tech_datalist.js?callback=data_callback",
        "https://tech.163.com/special/00097UHL/tech_datalist_02.js?callback=data_callback",
        "https://tech.163.com/special/00097UHL/tech_datalist_03.js?callback=data_callback",
    )

    # Headers sent with every request.
    REQUEST_HEADERS = {
        "Referer": "https://tech.163.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    }

    # Seconds before a request is abandoned; applied to list AND article requests.
    REQUEST_TIMEOUT = 30

    # A progress line is printed every this many list entries.
    PROGRESS_INTERVAL = 15

    # Score contributed by each keyword occurrence in the title / in the body.
    TITLE_WEIGHT = 5
    BODY_WEIGHT = 3

    # Inline CSS applied to highlighted keyword matches.
    HIGHLIGHT_STYLE = 'color:#dd4b39'

    def __init__(self):
        """Create the searcher and populate ``news_list`` via ``fetch_data``."""
        self.news_list = []
        self.fetch_data()

    def fetch_data(self):
        """Fill ``self.news_list`` from the cache file or from the network.

        If ``CACHE_FILE`` exists it is unpickled and nothing is downloaded.
        Otherwise every list page is fetched, each distinct article URL is
        downloaded once, and the collected ``[docurl, title, text]`` entries
        are pickled to ``CACHE_FILE`` (only when at least one was collected).

        Raises:
            ValueError: a list page is not a parsable JSONP payload.
            KeyError: a list entry lacks ``title`` or ``docurl``.
            Network errors raised by ``requests`` propagate unchanged.
        """
        if os.path.exists(self.CACHE_FILE):
            # SECURITY NOTE: pickle.load can execute arbitrary code embedded in
            # the file. Only load a cache this notebook wrote itself; never a
            # file obtained from an untrusted source.
            with open(self.CACHE_FILE, 'rb') as file:
                self.news_list = pickle.load(file)
            return

        # Remember URLs already handled so the same article is fetched once.
        seen_urls = set()
        processed_count = 0

        for list_url in self.LIST_URLS:
            resp = requests.get(list_url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)

            for news in self._parse_jsonp(resp.text):
                title = news['title']
                docurl = news['docurl']

                if docurl not in seen_urls:
                    seen_urls.add(docurl)
                    text = self._fetch_article_text(docurl)
                    if text is not None:
                        # Each entry stores: 1) link, 2) title, 3) full text.
                        self.news_list.append([docurl, title, text])

                # Counts every list entry, including repeated URLs.
                processed_count += 1
                if processed_count % self.PROGRESS_INTERVAL == 0:
                    print(f'{processed_count} processed.')

        if self.news_list:
            with open(self.CACHE_FILE, 'wb') as file:
                pickle.dump(self.news_list, file)

    @staticmethod
    def _parse_jsonp(payload):
        """Return the JSON value wrapped in a ``callback(...)`` JSONP payload.

        The wrapper is located by its first ``(`` and last ``)``, so trailing
        whitespace or a ``;`` after the closing parenthesis is tolerated.

        Raises:
            ValueError: no wrapping parentheses are found, or the content
                between them is not valid JSON.
        """
        start = payload.find('(')
        end = payload.rfind(')')
        if start == -1 or end <= start:
            raise ValueError('Unexpected JSONP payload: callback wrapper not found')
        return json.loads(payload[start + 1:end])

    def _fetch_article_text(self, docurl):
        """Download one article page and return its body text (or ``None``)."""
        doc_resp = requests.get(docurl, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
        doc_resp.encoding = 'utf-8'
        return self._extract_body_text(doc_resp.text)

    @staticmethod
    def _extract_body_text(page_source):
        """Extract the concatenated paragraph text of an article page.

        Returns ``None`` when the page is empty or has no
        ``<div class="post_body">``, and ``''`` when that div contains no
        ``<p>`` elements.
        """
        # An empty document does not produce a usable tree, so bail out early.
        if not page_source or not page_source.strip():
            return None

        tree = etree.HTML(page_source)
        if tree is None:
            return None

        post_body = tree.xpath("//div[@class='post_body']")
        if not post_body:
            return None

        paragraphs = post_body[0].xpath(".//p")
        if not paragraphs:
            # Nothing to serialise; re-parsing an empty string would fail.
            return ''

        # Serialise the paragraphs, re-parse them, and join every non-blank
        # text node with surrounding whitespace stripped.
        paragraphs_html = ''.join(etree.tostring(p, method='html', encoding='unicode') for p in paragraphs)
        return ''.join(t.strip() for t in etree.HTML(paragraphs_html).xpath("//text()") if t.strip())

    def search_keywords(self, keyword):
        """Return ``(index, score)`` pairs for articles containing ``keyword``.

        Matching is case-insensitive against title and body. Results are
        sorted by score, highest first; equal scores keep ``news_list`` order.
        An empty keyword matches nothing and returns ``[]``.
        """
        if not keyword:
            return []

        result = []
        for index, item in enumerate(self.news_list):
            item_score = self.score(item, keyword)
            # For a non-empty keyword a positive score is equivalent to the
            # keyword occurring in the title or the body.
            if item_score > 0:
                result.append((index, item_score))

        result.sort(key=lambda pair: pair[1], reverse=True)
        return result

    def highlight(self, text, keyword, escape_html=False):
        """Wrap every case-insensitive occurrence of ``keyword`` in a coloured span.

        The keyword is matched literally: regex metacharacters such as ``+``
        or ``.`` carry no special meaning.

        Args:
            text: the string to decorate.
            keyword: the literal text to highlight; empty means no highlighting.
            escape_html: when true, HTML-escape ``text`` (both matched and
                unmatched parts) so it can be embedded in markup safely.
                Defaults to ``False``, which leaves ``text`` otherwise as-is.

        Returns:
            ``text`` with each match wrapped in a ``<span>`` element.
        """
        transform = html.escape if escape_html else (lambda segment: segment)
        if not keyword:
            return transform(text)

        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            pieces.append(transform(text[cursor:match.start()]))
            pieces.append(f'<span style="{self.HIGHLIGHT_STYLE}">{transform(match.group(0))}</span>')
            cursor = match.end()
        pieces.append(transform(text[cursor:]))
        return ''.join(pieces)

    def score(self, item, keyword):
        """Score one ``[docurl, title, text]`` entry for ``keyword``.

        Each case-insensitive occurrence counts ``TITLE_WEIGHT`` in the title
        and ``BODY_WEIGHT`` in the body. An empty keyword scores 0.
        """
        if not keyword:
            return 0
        keyword_lower = keyword.lower()
        title_hits = item[1].lower().count(keyword_lower)
        body_hits = item[2].lower().count(keyword_lower)
        return title_hits * self.TITLE_WEIGHT + body_hits * self.BODY_WEIGHT

    def _format_result(self, index, item_score, keyword):
        """Build the HTML line ``[score] <a ...>title</a>`` for one result."""
        docurl = self.news_list[index][0]
        title = self.news_list[index][1]
        # URL and title come from scraped pages, so both are escaped before
        # being embedded in markup.
        clickable_title = (
            f'<a href="{html.escape(docurl, quote=True)}" target="_blank">'
            f'{self.highlight(title, keyword, escape_html=True)}</a>'
        )
        return f'[{item_score}] {clickable_title}'

    def render_search_result(self, keyword):
        """Display one highlighted, clickable title per article matching ``keyword``."""
        for index, item_score in self.search_keywords(keyword):
            display(HTML(self._format_result(index, item_score, keyword)))




In [2]:
# Build the searcher: __init__ calls fetch_data(), which loads news_list.dat
# when it exists and otherwise downloads the articles (printing progress).
searcher = MySearcher()

15 processed.
30 processed.
45 processed.
60 processed.
75 processed.
90 processed.


In [3]:
# Number of [docurl, title, text] entries held by the searcher.
len(searcher.news_list)

90

In [4]:
# Display the scored, highlighted title links for articles matching 'ai'.
searcher.render_search_result('ai')
