In [17]:
#This script summarizes web pages given by URL and writes each summary, together with the page's meta data, into a vector db

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os, re
# Read the API credentials from the local ini file and export them as
# environment variables; the OpenAI key is also kept in `openai_api_key`.
config = configparser.ConfigParser()
config.read('./keys.ini')
for _google_key in ('GOOGLE_API_KEY', 'GOOGLE_CSE_ID'):
    os.environ[_google_key] = config['GOOGLE'][_google_key]
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader (the page is loaded only; no splitting is done here)
def web_loader_docs(link:str):
    """Load the page at `link` with WebBaseLoader and return the loaded documents.

    Args:
        link: URL of the web page to load.

    Returns:
        The result of `WebBaseLoader(link).load()`, unsplit. Callers that need
        chunks run their own text splitter on the result.
    """
    return WebBaseLoader(link).load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
#summarize the story dialogue from the main page content of the URL (map-reduce version)
def story_summary(docs):
    """Summarize the documents of a web page with a map-reduce chain.

    The map prompt is applied to each document, and the resulting partial
    summaries are combined with the reduce prompt.

    Args:
        docs: documents of the web page, passed to `MapReduceDocumentsChain.run`.

    Returns:
        Whatever `MapReduceDocumentsChain.run(docs)` returns (the final summary).
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # Prompt texts: the first is applied per document (map), the second to the
    # collected partial summaries (reduce).
    map_text = """Summarize the dialogue in the docs.
        {docs}
        只输出中文。只输出总结，不需要评论故事。
        输出:"""
    reduce_text = """依次陈列这几段故事情节。
        {doc_summaries}
        只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
        输出:"""

    # Reduce side: stuff the partial summaries into one string and hand it to the reduce prompt.
    merge_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=PromptTemplate.from_template(reduce_text)),
        document_variable_name="doc_summaries",
    )
    # The same stuff chain is used both as the final combine step and as the
    # collapse step when the summaries exceed the context of the stuff chain.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=merge_chain,
        collapse_documents_chain=merge_chain,
        # The maximum number of tokens to group documents into.
        token_max=6000,
    )

    # Map side plus the reducer above; only the final result is returned
    # (intermediate map outputs are not included).
    pipeline = MapReduceDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=PromptTemplate.from_template(map_text)),
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

#use a single chain to summarize the story dialogue from the main page content of the URL
def story_summary_stuff(docs):
    """Summarize the documents of a web page in one pass with a StuffDocumentsChain.

    Args:
        docs: documents of the web page; they are inserted into the prompt's
            `text` variable by the stuff chain.

    Returns:
        Whatever `StuffDocumentsChain.run(docs)` returns (the summary).
    """
    stuff_prompt_text = """Summarize the dialogue in the text。
    "{text}"
    只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
    输出:"""

    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
    single_pass = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=PromptTemplate.from_template(stuff_prompt_text)),
        document_variable_name="text",
    )
    return single_pass.run(docs)

#summarize a web page from the scraped meta data
def websummary_meta(meta:dict, overwrite = False, runlm = True):
    """Summarize one web page and store the summary with its metadata in the Chroma db at ./cndb.

    Args:
        meta: metadata dictionary of the target page. `meta['source']` (the page
            URL) and `meta['stage']` are read. The dict is updated in place
            (`stage` None becomes "", `cg` and `indexed` are set) and is stored
            as the metadata of the written document.
        overwrite: when the page is already in the db, a falsy value skips it
            and a truthy value re-writes its record.
        runlm: when truthy the story is summarized with the LLM; when falsy the
            summary already stored for this page is reused.

    Returns:
        The stored `Document`, or a message string when the page is skipped
        because it is already in the db and `overwrite` is falsy.

    Raises:
        ValueError: if `runlm` is falsy and the db holds no summary for this
            page, so there is nothing to store.
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']
    if meta['stage'] is None:
        meta['stage'] = ""

    #load vector db for summary data
    db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())
    # Take ONE snapshot of the db so ids and documents are guaranteed to line up by index.
    stored = db.get()
    # Only the part before "_" is compared -- presumably so ids carrying a
    # chunk suffix such as "<id>_0001" still match; verify against the db contents.
    stored_keys = [x.split("_")[0] for x in stored['ids']]

    # The record id is the SHA-1 of the URL reduced modulo 10**8.
    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))

    story = None
    if this_id in stored_keys:
        if not overwrite:
            return(link+" link already in the db, skip")
        # Keep the stored summary so it can be reused when the LLM is not run.
        story = stored['documents'][stored_keys.index(this_id)]

    # Without an LLM run and without a stored summary there is nothing to
    # write; fail before doing any network work instead of hitting an unbound
    # `story` further down.
    if not runlm and story is None:
        raise ValueError(link + " has no stored summary to reuse; call with runlm=True")

    #load the page
    docs_org = web_loader_docs(link)

    #find CG page from the webpage
    meta['cg'] = get_cg(docs_org)

    #summarize the story
    if runlm:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 500)
        docs = text_splitter.split_documents(docs_org)
        # More than two chunks go through map-reduce; otherwise one stuff chain is used.
        if len(docs) > 2:
            story = story_summary(docs)
        else:
            story = story_summary_stuff(docs)

    #write to vector db
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    return(output_doc)

#return CG link from the story
def get_cg(docs):
    """Return a CG image URL for a story page, or "" when none can be found.

    The first `[Image(image="...")]` tag in `docs[0].page_content` gives the
    image name. The wiki file page for that image is then fetched and scanned
    for `https://` URLs.

    Args:
        docs: loaded documents of the story page; only `docs[0].page_content`
            is read.

    Returns:
        The third URL found on the file page when it contains "640"
        (presumably the 640px thumbnail), otherwise the first URL found.
        Returns "" when `docs` is empty, the page has no image tag, the file
        page cannot be fetched, or it contains no URL.
    """
    from urllib.request import urlopen

    if not docs:
        return ""

    #get the image name from the dialogue
    regex=r'(?<=\[Image\(image=\")[\w_]+'
    pics = re.findall(regex, docs[0].page_content)
    if len(pics)==0:
        return ""
    link = "https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_"+pics[0]+".png"
    print(link)

    #fetch the file page; this is best effort, so any fetch error yields ""
    try:
        # `with` closes the HTTP response even if read() fails.
        with urlopen(link) as response:
            html_page = response.read()
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return ""

    # str() of the bytes gives their repr; URLs are matched inside that text.
    urls = re.findall(r'https://[\w./-]+',str(html_page))
    if len(urls)==0:
        return ""
    # urls[2] exists as soon as there are three matches.
    if len(urls) >= 3 and '640' in urls[2]:
        return urls[2]
    return urls[0]

#main function, summarize a list of web pages from the scraped json
def run_scapy(file = "quotes.json", limit = "", overwrite = False, runlm = True):
    """Summarize every page listed in a JSON file whose stage contains `limit`.

    After each processed entry the entry is marked `indexed` and the whole list
    is written back to `file`, so progress is saved entry by entry.

    Args:
        file: path of the JSON file holding a list of page metadata dicts; each
            dict must have a "stage" key (its value may be null).
        limit: substring that an entry's stage must contain to be processed;
            the default "" matches every entry.
        overwrite: passed through to `websummary_meta`.
        runlm: passed through to `websummary_meta`.

    Returns:
        None. The result of each `websummary_meta` call is printed.
    """
    import json, time
    # Opening JSON file
    with open(file, encoding="utf-8") as f:
        scapy_list = json.load(f)

    for entry in scapy_list:
        # stage can be null in the json (websummary_meta handles None as well);
        # treat it as "" so the substring test cannot raise TypeError.
        stage = entry["stage"] or ""
        if limit not in stage:
            continue
        print(entry)
        print(websummary_meta(entry, overwrite, runlm))
        entry['indexed'] = True
        # Serialize BEFORE opening the file: open(..., "w") truncates it, so a
        # serialization error after opening would leave the file empty.
        payload = json.dumps(scapy_list)
        with open(file, "w", encoding="utf-8") as outfile:
            outfile.write(payload)
        time.sleep(1)


In [18]:
# Refresh the records of every entry in prts110123.json whose stage contains "ZT-":
# existing db entries are overwritten, and runlm=False reuses the summaries
# already stored instead of calling the LLM.
run_scapy(file = "prts110123.json", limit = "ZT-", overwrite = True, runlm = False)

{'source': 'https://prts.wiki/index.php?title=ZT-ST-1_%E5%89%8D%E5%A5%8F%E2%80%9C%E6%84%8F%E5%A4%96%E5%BD%92%E6%9D%A5%E2%80%9D/NBT&action=edit', 'indexed': True, 'stage': 'ZT-ST-1 前奏“意外归来”', 'cg': ''}
page_content='施彤领大区的夜晚风雨交加，举行了一场音乐会。年长的贵族对演奏赞不绝口，年轻的贵族心思不在音乐上，而是对失去的爱情感到伤感。他们谈论了选帝侯的庆典和崔林特尔梅的情况。薇薇安娜和珂拉的对话中，珂拉告诉薇薇安娜她的父亲已经去世，并暗示她的父亲是选帝侯。遭遇袭击后，薇薇安娜使用暗影法术保护了他们。在杂货店遇到另一个女性，她告诉他们关于选帝侯的事情。最后，他们发现选帝侯的贴身侍从是刺客，并与她展开了战斗。\n\n金律法卫发现了一位不寻常的术师，并称呼他为“首席”。一位年轻的贵族准备离开，并提到了一些关于阿尔图罗的事情。费德里科出现并与年轻贵族对话，透露了他正在寻找阿尔图罗。薇薇安娜和珂拉的对话中，他们讨论了薇薇安娜的身份和她的父亲的计划。故事转移到恩瓦德大区的崔林特尔梅周边城镇，黑键与别格勒的对话中，黑键表达了他对于命运的理解和对于未来的期待。' metadata={'source': 'https://prts.wiki/index.php?title=ZT-ST-1_%E5%89%8D%E5%A5%8F%E2%80%9C%E6%84%8F%E5%A4%96%E5%BD%92%E6%9D%A5%E2%80%9D/NBT&action=edit', 'indexed': True, 'stage': 'ZT-ST-1 前奏“意外归来”', 'cg': ''}
{'source': 'https://prts.wiki/index.php?title=ZT-1_%E6%B8%85%E5%94%B1%E2%80%9C%E6%99%B4%E7%A9%BA%E4%B9%8B%E6%AD%8C%E2%80%9D/BEG&action=edit', 'indexed': True, 'stage': 'ZT-1 清唱“晴空之歌” 行动前', 'cg': ''}
ht

In [140]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Open the persisted Chroma store in ./cndb for inspection (the same directory websummary_meta writes to).
db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())

In [105]:
# Show the metadata stored for the record at index 10.
# NOTE(review): `db` is created in a cell with a higher execution count (140) than
# this one (105) -- confirm this cell still works after Restart Kernel -> Run All.
db.get()['metadatas'][10]

{'cg': 'https://prts.wiki/images/thumb/f/f6/Avg_32_i03.png/640px-Avg_32_i03.png',
 'indexed': True,
 'source': 'https://prts.wiki/index.php?title=13-6_%E5%85%B8%E8%8C%83%E4%B9%8B%E5%90%8D/END&action=edit',
 'stage': '13-6 典范之名 行动后'}

In [8]:
# Load one story page (the edit-source view of a ZT-1 ".../BEG" page) to inspect what web_loader_docs returns.
link = "https://prts.wiki/index.php?title=ZT-1_%E6%B8%85%E5%94%B1%E2%80%9C%E6%99%B4%E7%A9%BA%E4%B9%8B%E6%AD%8C%E2%80%9D/BEG&action=edit"
res = web_loader_docs(link)


In [9]:
# Display the documents loaded in the previous cell (the full page text is shown).
res

[Document(page_content='\n\n\n\n查看“ZT-1 清唱“晴空之歌”/BEG”的源代码 - PRTS - 玩家共同构筑的明日方舟中文Wiki\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n查看“ZT-1 清唱“晴空之歌”/BEG”的源代码\n\n←ZT-1 清唱“晴空之歌”/BEG\n\n\n跳到导航\n跳到搜索\n因为以下原因，您没有权限编辑本页：\n\n\n您请求的操作仅限属于这些用户组的用户执行：用户、\u200bEditor\n\n\n您可以查看和复制此页面的源代码。\n{{剧情模拟器|图片数据={{Widget:Data_Image}}|角色数据={{Widget:Data_Char}}|音频数据={{Widget:Data_Audio}}|文本数据=\n[HEADER(key="title_test", is_skippable=true, fit_mode="BLACK_MASK")]\n[Blocker(a=1, r=0, g=0, b=0, fadetime=0, block=true)]\n[stopmusic]\n[Dialog]\n[Delay(time=1)]\n[playMusic(intro="$loneliness_intro",key="$loneliness_loop", volume=0.6)]\n[Background(image="bg_ltroom",screenadapt="coverall")]\n[Blocker(a=0, r=0, g=0, b=0, fadetime=1, block=true)]\n[Delay(time=1)]\n[name="贵族侍从"]夫人，您该休息了。\n[name="贵族侍从"]这是......第多少遍了？您画了这么多遍，还是......\n[name="贵族侍从"]要不要我联系下美术馆的人，让他们把画展时间延后？\n[name="贵族侍从"]自从那位音乐家小姐来了以后，您比往常精神了许多。但您的病还没好透，就一直在画这幅......这幅......\n[name="贵族侍从"]您画的是......黑夜？