In [1]:
#this script summarizes the web page at a URL and writes the summary (as page content) and the page metadata into a vector db

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os, re
# Read the API credentials from the local ini file and publish them as
# environment variables; openai_api_key is also kept as a module-level name.
config = configparser.ConfigParser()
config.read('./keys.ini')
for _google_key in ('GOOGLE_API_KEY', 'GOOGLE_CSE_ID'):
    os.environ[_google_key] = config['GOOGLE'][_google_key]
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader and split
def web_loader_docs(link:str):
    """Load the web page at ``link`` and return the loader's documents.

    input: link of the web page url
    output: whatever ``WebBaseLoader(link).load()`` yields, unsplit; callers
    that need smaller chunks split the result themselves.
    """
    return WebBaseLoader(link).load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
#summarize the story dialogue from the main page content of the URL (map-reduce version)
def story_summary(docs):
    """Summarize the dialogue in ``docs`` with a map-reduce chain.

    Every document is summarized on its own (map step); the partial summaries
    are then stuffed into a second prompt that lists the plot pieces in order
    (reduce step).

    input: docs of the web page
    output: the result of running the map-reduce chain over ``docs``
    """
    # One deterministic (temperature 0) chat model is shared by both steps.
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # Map prompt. The Chinese lines ask for Chinese-only output containing just
    # the summary, without commentary on the story.
    summarize_template = """Summarize the dialogue in the docs.
        {docs}
        只输出中文。只输出总结，不需要评论故事。
        输出:"""
    per_doc_chain = LLMChain(
        llm=chat_model,
        prompt=PromptTemplate.from_template(summarize_template),
    )

    # Reduce prompt. The Chinese lines ask to list the plot segments in order,
    # in Chinese only, without commentary and without repeated fragments.
    merge_template = """依次陈列这几段故事情节。
        {doc_summaries}
        只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
        输出:"""
    merge_chain = LLMChain(
        llm=chat_model,
        prompt=PromptTemplate.from_template(merge_template),
    )

    # Joins a list of documents into one string and feeds it to merge_chain.
    stuff_summaries = StuffDocumentsChain(
        llm_chain=merge_chain, document_variable_name="doc_summaries"
    )

    # The same stuffing chain serves as the final combine step and as the
    # collapse step used when the summaries exceed token_max.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_summaries,
        collapse_documents_chain=stuff_summaries,
        token_max=6000,
    )

    # Map per_doc_chain over the documents, then reduce; intermediate map
    # results are not returned.
    pipeline = MapReduceDocumentsChain(
        llm_chain=per_doc_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

#use a single chain to summarize the story dialogue from the main page content of the URL
def story_summary_stuff(docs):
    """Summarize the dialogue in ``docs`` with one "stuff" chain.

    All documents are joined into the ``text`` slot of a single prompt and
    sent to the model in one call.

    input: docs of the web page
    output: the result of running the stuff chain over ``docs``
    """
    # The Chinese lines ask for Chinese-only output containing just the story,
    # without commentary and without repeated fragments.
    prompt_template = """Summarize the dialogue in the text。
    "{text}"
    只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
    输出:"""

    single_pass = StuffDocumentsChain(
        llm_chain=LLMChain(
            llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k"),
            prompt=PromptTemplate.from_template(prompt_template),
        ),
        document_variable_name="text",
    )
    return single_pass.run(docs)

#summarize a web page from its scraped metadata
def websummary_meta(meta:dict, overwrite = False, runlm = True):
    """Summarize one scraped page and store the summary in the "./cndb" Chroma db.

    Parameters
    ----------
    meta : dict
        Metadata of the target page. ``meta['source']`` is the page URL;
        ``meta['stage']`` may be None or missing. The dict is updated in
        place: 'stage' is normalized to "" when None/missing, 'cg' receives
        the result of ``get_cg`` and 'indexed' is set to True.
    overwrite : bool
        When False, a link whose id is already in the db is skipped.
    runlm : bool
        When True the story is produced by the LLM summary chains. When False
        the summary already stored for this link is written back with the
        refreshed metadata; if nothing is stored the link is skipped.

    Returns
    -------
    The stored ``Document``, or a message string when the link was skipped.
    """
    # Imports stay local to the function, as in the original cell.
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']
    # NOTE(review): None is replaced by "" before the metadata is stored --
    # presumably because the vector store rejects None values; confirm.
    if meta.get('stage') is None:
        meta['stage'] = ""

    #load vector db for summary data
    db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())
    # One snapshot is used for both ids and documents, so the positions line up
    # and the store is not fetched twice.
    stored = db.get()
    # Only the part of a stored id before the first "_" is compared with the
    # link id.
    stored_ids = [x.split("_")[0] for x in stored['ids']]

    # The id is the SHA-1 of the link reduced to at most 8 decimal digits.
    # NOTE(review): with only 10**8 possible ids two links can collide; the
    # scheme is kept so ids already in the db stay valid.
    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))

    story = None
    if this_id in stored_ids:
        if not overwrite:
            return link + " link already in the db, skip"
        story = stored['documents'][stored_ids.index(this_id)]

    if not runlm and story is None:
        # No stored summary to reuse and the LLM is disabled: there is no text
        # to write (previously this path failed with an unbound `story`).
        return link + " link not in the db and runlm is False, skip"

    #load
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    #find CG page from the webpage
    meta['cg'] = get_cg(docs_org)

    #summarize the story: map-reduce for more than two chunks, one chain otherwise
    if runlm:
        if len(docs) > 2:
            story = story_summary(docs)
        else:
            story = story_summary_stuff(docs)

    #write to vector db
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    return output_doc

#return CG link from the story
def get_cg(docs, timeout = 30):
    """Return the URL of a CG picture referenced by the page dialogue, or "".

    The name inside the first ``[Image(image="...")]`` tag of
    ``docs[0].page_content`` is turned into a prts.wiki file-page link, that
    page is downloaded, and one of the https URLs found in it is returned.

    Parameters
    ----------
    docs : list
        Loaded documents of the story page; only the first one is inspected.
    timeout : float
        Seconds to wait for the file page before giving up.

    Returns
    -------
    str
        The third URL on the file page when it contains '640', otherwise the
        first URL. "" when ``docs`` is empty, no image tag is present, the
        download fails, or the page holds no https URL.
    """
    from urllib.request import urlopen

    if not docs:
        return ""

    #get the picture name from the dialogue
    regex=r'(?<=\[Image\(image=\")[\w_]+'
    pics = re.findall(regex, docs[0].page_content)
    if len(pics)==0:
        return ""
    link = "https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_"+pics[0]+".png"
    print(link)

    #get the 640px pic from the pic link
    try:
        # `with` closes the response; the lookup is best-effort, so any failure
        # (network, HTTP status, timeout) yields "" instead of aborting a run.
        with urlopen(link, timeout=timeout) as response:
            html_page = response.read()
    except Exception:
        return ""

    # str() of the bytes gives their ASCII repr, which is enough here because
    # the pattern only accepts ASCII URL characters.
    urls = re.findall(r'https://[\w./-]+',str(html_page))
    if not urls:
        return ""
    # Index 2 only needs three entries (the previous guard required four).
    if len(urls) > 2 and '640' in urls[2]:
        return urls[2]
    return urls[0]

#main function: summarize the list of web pages in the scraped JSON file
def run_scapy(file = "quotes.json", limit = "", overwrite = False, runlm = True):
    """Summarize every page listed in a scraped JSON file.

    Parameters
    ----------
    file : str
        Path of a JSON file holding a list of page-metadata dicts. The file is
        rewritten after every processed entry.
    limit : str
        Only entries whose 'stage' contains this substring are processed; the
        default "" matches every entry.
    overwrite, runlm :
        Passed through to ``websummary_meta``.

    Each processed entry is printed, handed to ``websummary_meta`` (whose
    result is printed as well) and marked ``'indexed': True`` -- also when
    ``websummary_meta`` reported a skip.
    """
    import json, time
    # Opening JSON file
    with open(file, encoding="utf-8") as f:
        scapy_list = json.load(f)

    for item in scapy_list:
        # 'stage' can be None (websummary_meta normalizes that case), so treat
        # None/missing as "" instead of letting the substring test raise.
        if limit not in (item.get("stage") or ""):
            continue
        print(item)
        print(websummary_meta(item, overwrite, runlm))
        item['indexed'] = True
        # The whole list is written back after each page -- presumably so an
        # interrupted run keeps its progress.
        with open(file, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(scapy_list))
        # NOTE(review): one-second pause between pages, presumably to go easy
        # on the remote services -- confirm.
        time.sleep(1)


In [2]:
# Summarize every entry of prts120523.json whose stage contains "RS-";
# links already stored in the db are skipped (overwrite = False).
run_scapy(file = "prts120523.json", limit = "RS-", overwrite = False, runlm = True)

{'source': 'https://prts.wiki/index.php?title=RS-ST-1_%E5%80%99%E8%BD%A6%E5%A4%A7%E5%8E%85/NBT&action=edit', 'indexed': False, 'stage': 'RS-ST-1 候车大厅'}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_45_i01_2.png


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


page_content='在这段对话中，谢拉格的领导人恩希欧迪斯与维多利亚子爵讨论了耶拉冈德像的落成仪式。恩希欧迪斯表示希望一切顺利，而维多利亚子爵则表达了对谢拉格的喜爱和对合作的期望。然而，恩希欧迪斯也提到了一些隐忧，包括耶拉冈德像的脸是否符合古代记载以及与公爵的合作是否会顺利进行。最后，他们约定在两天后的仪式上再继续讨论。\n\n老练的士兵告诉哈洛德他要忙去了，让哈洛德应付那群老爷。哈洛德问他们是不是又喝酒不带上他了，然后问他们把奶酪锅放哪了。接着，烈夏在火车上休息，然后听到广播宣布即将到达谢拉格圣山脚下。她下车后看到了驮兽和牧民，对驮兽很感兴趣。然后她遇到了哈洛德，他告诉她他们都是来观光的，然后邀请她一起去看耶拉冈德像。烈夏接受了邀请，他们一起踏上观光之旅。' metadata={'source': 'https://prts.wiki/index.php?title=RS-ST-1_%E5%80%99%E8%BD%A6%E5%A4%A7%E5%8E%85/NBT&action=edit', 'indexed': True, 'stage': 'RS-ST-1 候车大厅', 'cg': 'https://prts.wiki/images/thumb/4/44/Avg_45_i01_2.png/640px-Avg_45_i01_2.png'}
{'source': 'https://prts.wiki/index.php?title=RS-1_%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9/BEG&action=edit', 'indexed': False, 'stage': 'RS-1 注意事项 行动前'}
page_content='这段对话是明日方舟游戏中的剧情对话，具体内容包括角色之间的对话和动作描述。对话中涉及到一些角色的名字和动作，以及一些背景音效和特效的描述。' metadata={'source': 'https://prts.wiki/index.php?title=RS-1_%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9/BEG&action=edit', 'indexed': True, 'stage': 'RS-1 注意事项 行动前', 'cg': ''}
{'source': 'h

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


page_content='1. 列车到达终点站银心湖，介绍了银心湖的美景和耶拉冈德像。\n2. 哈洛德和锏之间发生了一些误会，但最终解开了。\n3. 烈夏加入了对话，询问了银心湖和弗根山的情况。\n4. 休露丝邀请烈夏参加耶拉冈德像的落成仪式。\n5. 恩希欧迪斯与一个富商讨论与喀兰贸易的合作。\n6. 烈夏遇到了休露丝，两人进行了一段对话。\n7. 灰礼帽点了一杯驮兽奶茶。\n8. 休露丝对菈塔托丝的任务感到困惑，她被要求盯着一个小女孩，并且需要继续接触对方。\n9. 休露丝觉得这个小女孩有点眼熟。\n10. 尤卡坦问休露丝是否需要帮忙，休露丝回答说事情交给她就可以了。\n11. 他们决定回去问菈塔托丝关于这个叫罗莎琳的小女孩的来历。' metadata={'source': 'https://prts.wiki/index.php?title=RS-3_%E4%B8%B4%E6%97%B6%E5%81%9C%E8%BD%A6/BEG&action=edit', 'indexed': True, 'stage': 'RS-3 临时停车 行动前', 'cg': 'https://prts.wiki/images/thumb/5/5a/Avg_45_i05.png/640px-Avg_45_i05.png'}
{'source': 'https://prts.wiki/index.php?title=RS-3_%E4%B8%B4%E6%97%B6%E5%81%9C%E8%BD%A6/END&action=edit', 'indexed': False, 'stage': 'RS-3 临时停车 行动后'}
page_content='1. 罗莎琳和她的妈妈一起向耶拉冈德祈祷。\n2. 烈夏来到耶拉冈德像前，试图祈祷，但不记得具体的步骤。老修士教她正确的姿势，并告诉她在耶拉冈德面前要闭上眼睛和默念耶拉冈德的名字。\n3. 烈夏开始祈祷，但突然变得激动起来，请求耶拉冈德保佑她的妈妈。老修士对烈夏的行为感到惊讶，但最终同意她的请求。\n4. 烈夏决定上山探险，老修士提供了住宿的地方。\n5. 山雪鬼们在监视烈夏，并讨论她的行动。\n6. 雅儿和恩雅讨论了耶拉冈德像的设计，雅儿希望能对雕像进行一些改动，但恩雅驳回了她的请求。' metadata={'source': 'htt

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


page_content='在这段对话中，魏斯给恩希欧迪斯送来了一封信，信是由哈洛德子爵托他转交的。恩希欧迪斯询问了魏斯关于哈洛德子爵的情况，并表示对他的态度感到好奇。魏斯回答说，哈洛德子爵对他们的监视已经了解得很清楚，并且通过这封信表达了一种态度。接着，魏斯向恩希欧迪斯汇报了他对维多利亚军队的观察，认为他们对当地人很尊重，值得尊重。恩希欧迪斯则表示对哈洛德子爵的态度和他们的行动感到好奇，并让魏斯继续观察。随后，他们讨论了信封中的内容，发现是一封邀请恩希欧迪斯和圣女大人参加晚宴的请帖。恩希欧迪斯决定亲自去取酒，表示要回应哈洛德子爵的诚意。接着，对话转到了另一个场景，阿克托斯和烈夏喝酒，但阿克托斯的酒变质了，烈夏却喝得很开心。最后，老修士让烈夏停止喝酒，并叫医生来检查她的情况。烈夏表示自己没事，因为她曾经吃过更糟糕的东西。\n在这段对话中，阿克托斯责备烈夏喝变质的酒，但烈夏表示自己曾经吃过更糟糕的东西。阿克托斯对此感到惊讶，但烈夏希望不再谈论过去的事情。她认为这瓶酒对阿克托斯来说很重要，不应该因为变质就丢掉。随后，哈洛德和油滑的士兵讨论晚宴的酒不够好，但哈洛德表示自己有一瓶珍藏的红酒。恩希欧迪斯和哈洛德交谈后，恩希欧迪斯送给哈洛德一瓶珍藏的酒。哈洛德感到惊喜，称赞这是一瓶好酒。恩希欧迪斯解释了酒的背景，并邀请哈洛德参加晚宴。在晚宴上，恩雅同意将这瓶酒作为正餐酒。哈洛德表示欢迎，晚宴正式开始。' metadata={'source': 'https://prts.wiki/index.php?title=RS-4_%E9%87%8D%E5%9B%9E%E6%AD%A3%E8%BD%A8/END&action=edit', 'indexed': True, 'stage': 'RS-4 重回正轨 行动后', 'cg': 'https://prts.wiki/images/thumb/8/8c/Avg_45_i07.png/640px-Avg_45_i07.png'}
{'source': 'https://prts.wiki/index.php?title=RS-ST-2_%E7%99%BB%E5%B1%B1%E9%93%81%E9%81%93/NBT&action=edit', 'indexed': False, 'stage': 'RS-ST-2 登山铁道'}
htt

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


page_content='1. 维多利亚士兵A和士兵B遭到袭击，维多利亚士兵C警告其他人小心。\n2. 锏女士出现并与维多利亚士兵战斗。\n3. 哈洛德与锏女士对话，讨论暴力与胜利的关系。\n4. 角峰和魏斯被绑在军营里，莫希出现并解救了他们。\n5. 恩雅和恩希欧迪斯讨论锏女士的能力和计划。\n6. 恩希欧迪斯和诺希斯讨论保护黑骑士和决策后果的问题。\n7. 锏继续战斗并受伤，但坚信自己能坚持下去。\n8. 锏和哈洛德讨论战争的必要性和后果。\n9. 锏决定继续战斗，哈洛德给恩希欧迪斯最后一次机会。\n10. "灰礼帽"遇到罗德岛的成员，并与他们交谈。\n11. 休露丝夫人邀请征战骑士参加耶拉冈德像的建成仪式。\n12. 莫布代表商业联合会表示愿意与贵公司合作。\n13. 锏和哈洛德讨论公爵阁下与希瓦艾什的恩怨以及卡西米尔的大饕餮们对喀兰贸易的看法。\n14. 锏决定回去，因为有人在等她。' metadata={'source': 'https://prts.wiki/index.php?title=RS-8_%E6%9E%81%E9%99%90%E6%97%B6%E9%80%9F/END&action=edit', 'indexed': True, 'stage': 'RS-8 极限时速 行动后', 'cg': 'https://prts.wiki/images/thumb/0/04/Avg_45_i04.png/640px-Avg_45_i04.png'}
{'source': 'https://prts.wiki/index.php?title=RS-ST-3_%E7%BB%88%E7%82%B9%E7%AB%99/NBT&action=edit', 'indexed': False, 'stage': 'RS-ST-3 终点站'}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_45_i01_2.png
page_content='1. 维多利亚士兵和阴沉的士兵讨论行动成功与否。\n2. 恩雅解释行动成功与耶拉冈德无关，并邀请士兵们参加庆祝宴会。\n3. 烈夏和博士讨论维多利亚和谢拉格之间的冲突以及他们的行动。\n4. 烈夏向阿克托斯表示不需要他的认可，并给他一些抚养费。\n5. 老修士和佩尔罗契家平民谈论塔季扬娜和

In [140]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Re-open the persisted store (same "./cndb" directory websummary_meta writes to) for inspection.
db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())

In [105]:
# Show the metadata of the entry at position 10 of the store listing.
# NOTE(review): relies on `db` from another cell; the execution counts are out of order.
db.get()['metadatas'][10]

{'cg': 'https://prts.wiki/images/thumb/f/f6/Avg_32_i03.png/640px-Avg_32_i03.png',
 'indexed': True,
 'source': 'https://prts.wiki/index.php?title=13-6_%E5%85%B8%E8%8C%83%E4%B9%8B%E5%90%8D/END&action=edit',
 'stage': '13-6 典范之名 行动后'}

In [8]:
# Load a single story page to look at the raw loader output.
link = "https://prts.wiki/index.php?title=ZT-1_%E6%B8%85%E5%94%B1%E2%80%9C%E6%99%B4%E7%A9%BA%E4%B9%8B%E6%AD%8C%E2%80%9D/BEG&action=edit"
res = web_loader_docs(link)


In [9]:
# Display the documents returned by web_loader_docs.
res

[Document(page_content='\n\n\n\n查看“ZT-1 清唱“晴空之歌”/BEG”的源代码 - PRTS - 玩家共同构筑的明日方舟中文Wiki\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n查看“ZT-1 清唱“晴空之歌”/BEG”的源代码\n\n←ZT-1 清唱“晴空之歌”/BEG\n\n\n跳到导航\n跳到搜索\n因为以下原因，您没有权限编辑本页：\n\n\n您请求的操作仅限属于这些用户组的用户执行：用户、\u200bEditor\n\n\n您可以查看和复制此页面的源代码。\n{{剧情模拟器|图片数据={{Widget:Data_Image}}|角色数据={{Widget:Data_Char}}|音频数据={{Widget:Data_Audio}}|文本数据=\n[HEADER(key="title_test", is_skippable=true, fit_mode="BLACK_MASK")]\n[Blocker(a=1, r=0, g=0, b=0, fadetime=0, block=true)]\n[stopmusic]\n[Dialog]\n[Delay(time=1)]\n[playMusic(intro="$loneliness_intro",key="$loneliness_loop", volume=0.6)]\n[Background(image="bg_ltroom",screenadapt="coverall")]\n[Blocker(a=0, r=0, g=0, b=0, fadetime=1, block=true)]\n[Delay(time=1)]\n[name="贵族侍从"]夫人，您该休息了。\n[name="贵族侍从"]这是......第多少遍了？您画了这么多遍，还是......\n[name="贵族侍从"]要不要我联系下美术馆的人，让他们把画展时间延后？\n[name="贵族侍从"]自从那位音乐家小姐来了以后，您比往常精神了许多。但您的病还没好透，就一直在画这幅......这幅......\n[name="贵族侍从"]您画的是......黑夜？