In [1]:
import requests
from requests.structures import CaseInsensitiveDict
import json
from lxml import etree
import os
import re
from IPython.display import HTML, display
import pickle

class MySearcherC4:
    """
    Keyword searcher over crawled tech news, as integrated in lesson 4.

    ``news_list`` holds one ``[docurl, title, text]`` entry per article and is
    filled by :meth:`fetch_data` when the instance is created.
    """
    def __init__(self):
        self.news_list = []
        self.fetch_data()

    def fetch_data(self):
        """
        Populate ``self.news_list`` from the local cache file or by crawling.

        If ``news_list.dat`` exists in the working directory it is unpickled
        and used as-is. Otherwise the three listing feeds are downloaded, every
        not-yet-seen article page is fetched and reduced to plain text, and the
        collected list is pickled to ``news_list.dat`` (only when at least one
        article was collected).

        Raises:
            requests.RequestException: a listing or article request failed or
                timed out.
            ValueError: a listing response is not a JSONP-wrapped JSON payload.
        """
        news_list_file = 'news_list.dat'

        if os.path.exists(news_list_file):
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file. That is acceptable only while this is the file written at
            # the end of this method; never point it at untrusted data.
            with open(news_list_file, 'rb') as file:
                self.news_list = pickle.load(file)
            return

        urls = [
            "https://tech.163.com/special/00097UHL/tech_datalist.js?callback=data_callback",
            "https://tech.163.com/special/00097UHL/tech_datalist_02.js?callback=data_callback",
            "https://tech.163.com/special/00097UHL/tech_datalist_03.js?callback=data_callback",
        ]
        headers = CaseInsensitiveDict()
        headers["Referer"] = "https://tech.163.com/"
        headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"

        url_set = set()
        processed_count = 0

        for url in urls:
            # Same timeout as the article requests so a stalled listing
            # request cannot block the notebook forever.
            resp = requests.get(url, headers=headers, timeout=30)
            json_data = self._parse_jsonp(resp.text)

            for news in json_data:
                title = news['title']
                docurl = news['docurl']

                # The same article can appear in several listings; fetch it once.
                if docurl not in url_set:
                    doc_resp = requests.get(docurl, headers=headers, timeout=30)
                    doc_resp.encoding = 'utf-8'

                    text = self._extract_text(doc_resp.text)
                    if text is not None:
                        self.news_list.append([docurl, title, text])

                    url_set.add(docurl)

                # Counts listing entries (duplicates included), not downloads.
                processed_count += 1
                if processed_count % 15 == 0:
                    print(f'{processed_count} processed.')

        if self.news_list:
            with open(news_list_file, 'wb') as file:
                pickle.dump(self.news_list, file)

    @staticmethod
    def _parse_jsonp(payload):
        """
        Decode the JSON document wrapped in a ``callback(...)`` JSONP response.

        Everything between the first ``(`` and the last ``)`` is parsed, so
        surrounding whitespace does not break decoding.

        Raises:
            ValueError: no parentheses were found or the content is not JSON.
        """
        start = payload.index('(') + 1
        end = payload.rindex(')')
        return json.loads(payload[start:end])

    @staticmethod
    def _extract_text(page_source):
        """
        Return the concatenated paragraph text of an article page.

        Returns ``None`` when the page has no ``div.post_body`` (the article is
        then skipped) and ``''`` when that div contains no ``<p>`` elements.
        """
        tree = etree.HTML(page_source)
        # NOTE(review): guard in case lxml yields no tree for an empty page.
        if tree is None:
            return None

        post_body = tree.xpath("//div[@class='post_body']")
        if not post_body:
            return None

        paragraphs = post_body[0].xpath(".//p")
        if not paragraphs:
            # Nothing to serialise; re-parsing an empty fragment would not
            # produce a usable tree, so keep the article with an empty body.
            return ''

        # Serialise the paragraphs and re-parse them so that only text living
        # in (or directly after) <p> elements is collected.
        fragment = ''.join(etree.tostring(p, method='html', encoding='unicode') for p in paragraphs)
        return ''.join(t.strip() for t in etree.HTML(fragment).xpath("//text()") if t.strip())

    def search_keywords(self, keyword):
        """
        Return ``(index, score)`` pairs for all articles containing ``keyword``.

        Matching is case-insensitive against title and body. The list is
        sorted by descending score; ties keep their ``news_list`` order. An
        empty keyword matches nothing and yields ``[]``.
        """
        if not keyword:
            return []

        result = []
        for i, item in enumerate(self.news_list):
            # A positive score means the keyword occurs in the title or body,
            # so one scoring pass replaces the separate containment test.
            item_score = self.score(item, keyword)
            if item_score > 0:
                result.append((i, item_score))
        result.sort(key=lambda x: x[1], reverse=True)
        return result

    def highlight(self, text, keyword):
        """
        Wrap every case-insensitive occurrence of ``keyword`` in a red span.

        The keyword is matched literally (regex metacharacters are escaped).
        An empty keyword returns ``text`` unchanged.
        """
        if not keyword:
            return text
        return re.sub(pattern=f'({re.escape(keyword)})', repl=r'<span style="color:#dd4b39">\1</span>', string=text, flags=re.IGNORECASE)

    def score(self, item, keyword):
        """
        Score one ``[docurl, title, text]`` entry for ``keyword``.

        Each case-insensitive occurrence in the title counts 5 and each
        occurrence in the body counts 3.
        """
        keyword_lower = keyword.lower()
        return (item[1].lower().count(keyword_lower) * 5 + item[2].lower().count(keyword_lower) * 3)

    def render_search_result(self, keyword):
        """
        Display one ``[score] <linked, highlighted title>`` line per match.

        Results are shown in the order returned by :meth:`search_keywords`.
        """
        result = self.search_keywords(keyword)
        for item in result:
            # NOTE(review): URL and title come from crawled pages and are put
            # into the markup without HTML escaping.
            clickable_title = f'<a href="{self.news_list[item[0]][0]}" target="_blank">{self.highlight(self.news_list[item[0]][1], keyword)}</a>'
            display(HTML(f'[{item[1]}] {clickable_title}'))

In [2]:
import timeit

class MySearcherC5V1(MySearcherC4):
    """
    Adds a ``scale`` parameter that repeats the news list ``scale`` times,
    giving a larger corpus for timing experiments.
    """
    def __init__(self, scale=1):
        # The parent constructor loads (or crawls) the base news list.
        super().__init__()
        # List repetition: the entries are repeated, not copied.
        self.news_list = self.news_list * scale



In [3]:
%%time
# Build one searcher per corpus size. Every constructor runs fetch_data, which
# crawls only when news_list.dat is missing and otherwise loads the pickle.
searcher_1x = MySearcherC5V1()
searcher_10x = MySearcherC5V1(scale=10)
searcher_100x = MySearcherC5V1(scale=100)
searcher_1000x = MySearcherC5V1(scale=1000)


15 processed.
30 processed.
45 processed.
60 processed.
75 processed.
90 processed.
CPU times: total: 3.41 s
Wall time: 46.1 s


In [4]:
# Time a single uncached search on the 1x and the 10x corpus.
%time r=searcher_1x.search_keywords('ai')
%time r=searcher_10x.search_keywords('ai')

CPU times: total: 0 ns
Wall time: 5 ms
CPU times: total: 31.2 ms
Wall time: 30.9 ms


In [5]:
# One uncached search per corpus size: (output label, statement to time).
single_run_cases = (
    ('1x\t', "r=searcher_1x.search_keywords('ai')"),
    ('10x\t', "r=searcher_10x.search_keywords('ai')"),
    ('100x\t', "r=searcher_100x.search_keywords('ai')"),
    ('1000x\t', "r=searcher_1000x.search_keywords('ai')"),
)
for label, stmt in single_run_cases:
    print(label, '%0.3f' % timeit.timeit(stmt=stmt, globals=globals(), number=1))


1x	 0.003
10x	 0.017
100x	 0.156
1000x	 1.547


In [6]:
# Repeat the same search on the 10x corpus: (output label, number of runs).
repeat_cases = (
    ('1time10x\t', 1),
    ('10times10x\t', 10),
    ('100times10x\t', 100),
    ('1000times10x\t', 1000),
)
for label, repeats in repeat_cases:
    print(label, '%0.3f' % timeit.timeit(stmt="r=searcher_10x.search_keywords('ai')", globals=globals(), number=repeats))


1time10x	 0.017
10times10x	 0.143
100times10x	 1.463
1000times10x	 14.618


In [7]:
class MySearcherC5V2(MySearcherC5V1):
    """
    Adds a result cache: searching the same keyword again does not recompute.
    """

    def __init__(self, scale=1):
        super().__init__(scale)
        # Maps a lower-cased keyword to the result list computed for it.
        self.cache = {}

    def search_keywords(self, keyword):
        """
        Return the ``(index, score)`` list for ``keyword``, using the cache.

        The cache key is the lower-cased keyword, so queries differing only in
        case share one entry. The returned list is the cached object itself;
        callers must not mutate it.
        """
        keyword_lower = keyword.lower()
        if keyword_lower in self.cache:
            result = self.cache[keyword_lower]
        else:
            # Reuse the parent's scan-and-sort instead of duplicating it here,
            # so both classes always rank results the same way.
            result = super().search_keywords(keyword)
            self.cache[keyword_lower] = result
        # Single exit point.
        return result

In [8]:
%%time
# Construction runs fetch_data and creates an empty cache; no search runs yet.
searcherV2_10x = MySearcherC5V2(scale=10)

CPU times: total: 0 ns
Wall time: 998 µs


In [9]:


# Repeat the same search on the cached 10x searcher: (output label, number of runs).
cached_repeat_cases = (
    ('1time10x\t', 1),
    ('10times10x\t', 10),
    ('100times10x\t', 100),
    ('1000times10x\t', 1000),
)
for label, repeats in cached_repeat_cases:
    print(label, '%0.3f' % timeit.timeit(stmt="r=searcherV2_10x.search_keywords('ai')", globals=globals(), number=repeats))


1time10x	 0.036
10times10x	 0.000
100times10x	 0.000
1000times10x	 0.001


In [10]:
class MySearcherC5V3(MySearcherC5V2):
    """
    Pre-processing for pseudo query words: do the work offline (at
    construction) instead of online (at query time).
    """
    def __init__(self, scale=1):
        super().__init__(scale)
        self.trending_words = {'ai', '华为', 'iphone'}
        self.cache_trending_words()

    def cache_trending_words(self):
        """Search every trending word once so its result is stored in the cache."""
        for trending_word in self.trending_words:
            # Only the caching side effect matters; the result is discarded.
            self.search_keywords(trending_word)




In [11]:
%%time
# The constructor also searches each trending word once, filling the cache.
searcherV3_10x = MySearcherC5V3(scale=10)


CPU times: total: 62.5 ms
Wall time: 43.9 ms


In [12]:
# Repeat a pre-cached trending word on the 10x searcher: (output label, number of runs).
precached_repeat_cases = (
    ('1time10x\t', 1),
    ('10times10x\t', 10),
    ('100times10x\t', 100),
    ('1000times10x\t', 1000),
)
for label, repeats in precached_repeat_cases:
    print(label, '%0.3f' % timeit.timeit(stmt="r=searcherV3_10x.search_keywords('ai')", globals=globals(), number=repeats))


1time10x	 0.000
10times10x	 0.000
100times10x	 0.000
1000times10x	 0.000


In [13]:
import jieba

class MySearcherC5V4(MySearcherC5V2):
    """
    Pre-processing for every word obtained by segmenting the documents (the
    documents themselves act as the filter on the vocabulary).
    """
    def __init__(self, scale=1):
        super().__init__(scale)
        self.cache_trending_words()

    def cache_trending_words(self):
        """
        Search every distinct word found in the corpus so its result is cached.

        Titles and bodies are segmented with ``jieba.cut(..., cut_all=True)``.
        Each distinct ``(title, text)`` pair is segmented once, even when
        ``scale`` has repeated it in ``news_list``. Blank tokens are skipped
        because they are not meaningful queries.
        """
        seen_docs = set()
        vocabulary = set()
        for news in self.news_list:
            doc_key = (news[1], news[2])
            if doc_key in seen_docs:
                # Repeated entry: it contributes no new words.
                continue
            seen_docs.add(doc_key)
            for word in jieba.cut(news[1] + news[2], cut_all=True):
                # NOTE(review): assumes the tokenizer can yield empty or
                # whitespace-only tokens - confirm against installed jieba.
                if word.strip():
                    vocabulary.add(word)

        for word in vocabulary:
            # Only the caching side effect matters; the result is discarded.
            self.search_keywords(word)



In [16]:
%%time
# The constructor segments every article and searches each resulting word,
# so the whole pre-computation cost is paid here.
searcherV4_1x = MySearcherC5V4()


CPU times: total: 15.8 s
Wall time: 15.8 s


In [17]:
%%time
# Served from the cache if '苹果' was among the tokens searched by the
# constructor; otherwise this falls back to a full scan.
r = searcherV4_1x.search_keywords('苹果')

CPU times: total: 0 ns
Wall time: 0 ns
