In [145]:
# This script summarizes the dialogue on a web page given its URL and writes the summary, together with the page's metadata, into a vector db.

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os
# Load the API credentials from ./keys.ini and export them as environment
# variables (presumably where the Google search and OpenAI wrappers imported in
# this notebook look them up -- confirm against the langchain version in use).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing keys.ini fails with
# a clear message instead of an opaque "KeyError: 'GOOGLE'" further down.
if not config.read('./keys.ini'):
    raise FileNotFoundError(
        "./keys.ini was not found or could not be read; "
        "it must provide the [GOOGLE] and [OPENAI] sections"
    )
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader and split
def web_loader_docs(link:str):
    """Load the web page at ``link`` and return the loaded documents.

    Args:
        link: URL of the web page to load.

    Returns:
        Whatever ``WebBaseLoader(link).load()`` produces. No splitting is
        applied here.
    """
    return WebBaseLoader(link).load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
# Summarize the story dialogue from the main page content of the URL (map step per chunk, then a reduce step).
def story_summary(docs):
    """Summarize the dialogue in ``docs`` with a map-reduce LLM pipeline.

    Every document is first summarized on its own with the map prompt; the
    partial summaries are then combined into the reduce prompt, which asks for
    the plot segments to be listed in order. Both prompts request Chinese-only
    output without commentary.

    Args:
        docs: documents (page chunks) to summarize.

    Returns:
        The value returned by ``MapReduceDocumentsChain.run(docs)``.
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # Map step: one summary per input document. The leading spaces inside the
    # template are part of the prompt text and are kept exactly.
    map_prompt_text = (
        "Summarize the dialogue in the docs.\n"
        "        {docs}\n"
        "        只输出中文。只输出总结，不需要评论故事。\n"
        "        输出:"
    )
    per_doc_chain = LLMChain(
        llm=chat_model,
        prompt=PromptTemplate.from_template(map_prompt_text),
    )

    # Reduce step: the partial summaries are joined into one string and passed
    # to a second LLM call through the "doc_summaries" variable.
    reduce_prompt_text = (
        "依次陈列这几段故事情节。\n"
        "        {doc_summaries}\n"
        "        只输出中文。只输出故事，不需要评论故事。不要输出重复片段!\n"
        "        输出:"
    )
    merge_chain = StuffDocumentsChain(
        llm_chain=LLMChain(
            llm=chat_model,
            prompt=PromptTemplate.from_template(reduce_prompt_text),
        ),
        document_variable_name="doc_summaries",
    )

    # The same stuffing chain serves as the final combine step and as the
    # collapse step used when the summaries exceed the token budget.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=merge_chain,
        collapse_documents_chain=merge_chain,
        token_max=6000,  # maximum number of tokens to group documents into
    )

    pipeline = MapReduceDocumentsChain(
        llm_chain=per_doc_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",  # variable of the map prompt that receives each document
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

# Use a single chain to summarize the story dialogue from the main page content of the URL.
def story_summary_stuff(docs):
    """Summarize the dialogue in ``docs`` with one LLM call.

    All documents are placed into a single prompt through the "text" variable.
    The prompt requests Chinese-only output without commentary or repeated
    fragments.

    Args:
        docs: documents of the web page to summarize.

    Returns:
        The value returned by ``StuffDocumentsChain.run(docs)``.
    """
    # The leading spaces inside the template are part of the prompt text.
    template_text = (
        "Summarize the dialogue in the text。\n"
        '    "{text}"\n'
        "    只输出中文。只输出故事，不需要评论故事。不要输出重复片段!\n"
        "    输出:"
    )
    summary_prompt = PromptTemplate.from_template(template_text)

    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
    single_call_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=summary_prompt),
        document_variable_name="text",
    )
    return single_call_chain.run(docs)

# Summarize a web page described by its scraped metadata and store the result in the vector db.
def websummary_meta(meta:dict, overwrite = False, runlm = True):
    """Summarize one page and store the summary in the ``./cndb`` Chroma db.

    The db id of a page is derived from the SHA-1 of its URL (reduced modulo
    10**8), so the same link always maps to the same entry.

    Args:
        meta: metadata of the target page. Must contain 'source' (the page
            URL). The dict is modified in place: a missing or None 'stage'
            becomes "", 'cg' receives the result of ``get_cg`` and 'indexed'
            is set to True before the entry is written.
        overwrite: when false, a link that is already in the db is skipped.
        runlm: when true, the summary is produced by the LLM chains; when
            false, the summary already stored for this link is reused and
            only the metadata is refreshed.

    Returns:
        The ``Document`` written to the db, or a message string ending in
        "skip" when the link is already stored and ``overwrite`` is false.

    Raises:
        ValueError: ``runlm`` is false but the db holds no summary for this
            link, so there is nothing to reuse.
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']
    if meta.get('stage') is None:
        meta['stage'] = ""

    # Load the vector db that holds the summaries and read it once, so that
    # ids and documents come from the same snapshot.
    db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())
    stored = db.get()
    # Compare on the part of each id before the first "_" (ids presumably may
    # carry a "_<chunk>" suffix -- verify against how other writers build ids).
    stored_ids = [doc_id.split("_")[0] for doc_id in stored['ids']]

    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))
    already_stored = this_id in stored_ids
    if already_stored and not overwrite:
        return(link+" link already in the db, skip")

    story = None
    if not runlm:
        # Without the LLM the only possible summary is the stored one. Fail
        # here, before any page download, instead of hitting an unbound
        # `story` when the Document is built.
        if not already_stored:
            raise ValueError(
                link + " has no stored summary to reuse; call with runlm=True to create one"
            )
        story = stored['documents'][stored_ids.index(this_id)]

    # Load the page and split it into chunks for summarization.
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    # Find the CG picture link from the unsplit page.
    meta['cg'] = get_cg(docs_org)

    # Summarize: map-reduce for more than two chunks, a single call otherwise.
    if runlm:
        story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)

    # Write to the vector db under the link-derived id.
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    return(output_doc)

#return CG link from the story
def get_cg(docs):
    """Return the URL of the CG picture referenced by the page, or "".

    The first ``[Image(image="<name>"`` marker in ``docs[0].page_content`` gives
    the picture name; the corresponding prts.wiki file page is downloaded and
    scanned for https URLs.

    Args:
        docs: loaded page documents; only ``docs[0].page_content`` is read.

    Returns:
        "" when ``docs`` is empty, the page has no image marker, or the file
        page contains no https URL. Otherwise the third URL found when there
        are more than three URLs and it contains "640", else the first URL.

    Raises:
        urllib.error.URLError: the file page could not be fetched.
    """
    # `re` is not imported at file level, so import it here to keep the
    # function usable on a fresh kernel.
    import re
    from urllib.request import urlopen

    if not docs:
        return ""

    # Picture name taken from the dialogue script.
    names = re.findall(r'(?<=\[Image\(image=\")[\w_]+', docs[0].page_content)
    if not names:
        return ""
    link = "https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_"+names[0]+".png"

    # Fetch the file page; the context manager closes the connection and the
    # timeout keeps a stalled request from hanging a batch run.
    with urlopen(link, timeout=30) as response:
        html_text = response.read().decode("utf-8", errors="replace")

    pics = re.findall(r'https://[\w./-]+', html_text)
    if not pics:
        return ""
    # NOTE(review): index 2 is only consulted when there are MORE than three
    # URLs; `> 2` would be enough to make pics[2] valid. Kept as-is so existing
    # results do not change -- confirm whether this is an off-by-one.
    if len(pics) > 3 and '640' in pics[2]:
        return pics[2]
    return pics[0]

# Main function: summarize every web page listed in the scraped JSON file whose stage matches the filter.
def run_scapy(file = "quotes.json", limit = "", overwrite = False, runlm = True, delay = 16):
    """Summarize the pages listed in a scraped JSON file.

    The file must contain a JSON list of metadata dicts. Each entry whose
    'stage' contains ``limit`` is passed to ``websummary_meta``; the result is
    printed, the entry is marked ``indexed`` and the whole list is written back
    to ``file`` right away, so progress is kept if a later page fails.

    Args:
        file: path of the JSON file; it is rewritten in place.
        limit: substring an entry's 'stage' must contain; "" matches all.
        overwrite: forwarded to ``websummary_meta``.
        runlm: forwarded to ``websummary_meta``.
        delay: seconds to sleep after each processed page (default 16, the
            previously hard-coded pause).
    """
    import json, time

    with open(file, encoding="utf-8") as f:
        scapy_list = json.load(f)

    for entry in scapy_list:
        # 'stage' can be null in the scraped data (websummary_meta normalizes
        # it too); treat null or missing as "" so the substring test cannot
        # raise a TypeError.
        stage = entry.get("stage") or ""
        if limit not in stage:
            continue
        print(websummary_meta(entry, overwrite, runlm))
        entry['indexed'] = True
        with open(file, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(scapy_list))
        time.sleep(delay)


In [148]:
# Process the entries of prts101023.json whose stage contains "CV-7",
# re-summarizing with the LLM even when a link is already stored in the db.
run_scapy(file = "prts101023.json", limit = "CV-7", overwrite = True, runlm = True)

page_content='1. 迈尔斯和杰西卡等人在达维镇等车，迈尔斯回忆过去的事情。\n2. 车队首领宣布车队到达，海伦娜、伍德洛和杰西卡讨论是否要抢夺银行的钱。\n3. 西尔维娅加入讨论并提供银行的地图。\n4. 伍德洛和海伦娜对抢劫银行表示怀疑，但杰西卡决定加入他们。\n5. 杰西卡拿出自己的铳，表明自己的决心。\n6. 杰西卡找伍德洛哭诉，伍德洛安慰她。\n7. 杰西卡突然发现异常情况，决定支持他们并拿出自己的武器。' metadata={'source': 'https://prts.wiki/index.php?title=CV-7_%E7%99%BD%E7%83%AD/BEG&action=edit', 'indexed': True, 'stage': 'CV-7 白热 行动前', 'cg': ''}
page_content='芙兰卡、雷蛇、罗拉和杰西卡讨论了杰西卡的临时脱队申请和退队申请。杰西卡解释了她有一些私事需要处理，但她并不确定自己的选择是否值得。雷蛇表示不赞同杰西卡的选择，但最终同意了她的决定。随后，杰西卡和伍德洛、海伦娜、里昂一起进行了一次银行抢劫行动。在行动中，杰西卡展示了她的技能和勇气。最后，杰西卡收到了一张纸条，上面写着一些鼓励的话和祝福。\n\n西尔维娅告诉大家金库内还有两道门，一道栅栏门和一道重达数十吨的保险门。栅栏门需要密码打开，密码分为三段，行长和经理各掌握一段，还有一段由密码器随机生成的数字。海伦娜和里昂成功打开了保险箱，但里昂担心伍德洛那边的情况。接着，伍德洛和杰西卡遇到了银行经理，杰西卡要求经理告知密码，否则他们会被杀死。经理嘲笑他们的贪婪，并预言他们会被银行追踪并分食。最后，经理告诉他们密码，伍德洛准备引爆炸药。爆炸后，他们发现大量的钞票，但伍德洛提醒大家没有时间感慨，要开始装钱。杰西卡感叹自己第一次对金钱如此渴望。' metadata={'source': 'https://prts.wiki/index.php?title=CV-7_%E7%99%BD%E7%83%AD/END&action=edit', 'indexed': True, 'stage': 'CV-7 白热 行动后', 'cg': 'https://prts.wiki/images/thumb/e/e9/Avg_42_i02.png/640p

In [140]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Open the persisted Chroma store in ./cndb (the directory websummary_meta writes to) for inspection.
db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())

In [105]:
# Display the metadata dict of the entry at position 10 of the store's listing.
db.get()['metadatas'][10]

{'cg': 'https://prts.wiki/images/thumb/f/f6/Avg_32_i03.png/640px-Avg_32_i03.png',
 'indexed': True,
 'source': 'https://prts.wiki/index.php?title=13-6_%E5%85%B8%E8%8C%83%E4%B9%8B%E5%90%8D/END&action=edit',
 'stage': '13-6 典范之名 行动后'}