In [8]:
#this script summarizes web pages given their URLs and writes each summary (as page content) together with the page metadata into a vector db

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os, re
# Read the API credentials from the local keys.ini file and export them as
# environment variables.
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# keys.ini only shows up as a KeyError on the section lookups below.
config.read('./keys.ini')
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
# The OpenAI key is kept both as a module-level variable and in the environment.
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader and split
def web_loader_docs(link:str):
    """Load the web page at ``link`` and return the resulting documents.

    Args:
        link: URL of the web page to load.

    Returns:
        The result of ``WebBaseLoader(link).load()``; no text splitting is
        applied here.
    """
    page_loader = WebBaseLoader(link)
    return page_loader.load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
#summarize the story dialogue from the main page content of the URL
def story_summary(docs):
    """Summarize the dialogue of a page with a map-reduce chain.

    Each document is summarized on its own first (map step); the partial
    summaries are then lined up into one story (reduce step). Both prompts
    instruct the model to output Chinese only.

    Args:
        docs: documents of the web page, handed to the chain unchanged.

    Returns:
        The value returned by ``MapReduceDocumentsChain.run(docs)``.
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    def make_chain(template_text):
        # Pair a prompt template with the shared chat model.
        return LLMChain(llm=chat_model, prompt=PromptTemplate.from_template(template_text))

    # Prompt applied to every document individually (map step).
    per_doc_template = """Summarize the dialogue in the docs.
        {docs}
        只输出中文。只输出总结，不需要评论故事。
        输出:"""
    # Prompt that lists the partial summaries in order (reduce step).
    merge_template = """依次陈列这几段故事情节。
        {doc_summaries}
        只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
        输出:"""

    # Joins the partial summaries into a single string for the merge prompt.
    stuff_summaries = StuffDocumentsChain(
        llm_chain=make_chain(merge_template),
        document_variable_name="doc_summaries",
    )
    # The same stuffing chain serves as the final combine step and as the
    # collapse step; documents are grouped with token_max=6000.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_summaries,
        collapse_documents_chain=stuff_summaries,
        token_max=6000,
    )
    # Map each document through the per-document chain, then reduce; the
    # intermediate map results are not returned.
    pipeline = MapReduceDocumentsChain(
        llm_chain=make_chain(per_doc_template),
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

#use a single chain to summarize the story dialogue from the main page content of the URL
def story_summary_stuff(docs):
    """Summarize the dialogue of a page with a single "stuff" chain.

    All documents are inserted into one prompt (as ``{text}``) and sent to
    the model in one chain run; the prompt asks for Chinese output only.

    Args:
        docs: documents of the web page, handed to the chain unchanged.

    Returns:
        The value returned by ``StuffDocumentsChain.run(docs)``.
    """
    single_prompt = PromptTemplate.from_template("""Summarize the dialogue in the text。
    "{text}"
    只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
    输出:""")
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    summarizer = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=single_prompt),
        document_variable_name="text",
    )
    return summarizer.run(docs)

#summarize a web page from its scraped metadata
def websummary_meta(meta:dict, overwrite = False, runlm = True, limit = ""):
    """Summarize one scraped page and store the result in the "./cndb" Chroma db.

    ``meta`` is updated in place: 'stage' is normalized (None becomes "" and
    ``limit`` is prefixed when it is not already contained in it), and 'cg'
    and 'indexed' are set.

    Args:
        meta: metadata dict of the target page; must provide 'source' (the
            page URL) and 'stage' (a string or None).
        overwrite: when false, a link whose id is already in the db is
            skipped; when true, the page is processed again and the stored
            text is used as the starting value of the summary.
        runlm: when true, the summary is regenerated with the LLM chains;
            otherwise the stored text (or the literal "None" for a new link)
            is written.
        limit: tag that is prefixed to the stage name as "<limit>-<stage>".

    Returns:
        The Document that was added to the db, or the string
        "<link> link already in the db, skip" when the link was skipped.
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']
    #meta['characters'] = ','.join(meta['characters'])
    if meta['stage'] is None:
        meta['stage'] = ""
    if limit not in meta['stage']:
        meta['stage'] = limit + "-" + meta['stage']

    # Load the vector db that holds the summaries and take ONE snapshot of its
    # content, so the id list and the documents are guaranteed to line up.
    db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())
    stored = db.get()
    # Only the part of each stored id before the first "_" is compared.
    stored_ids = [x.split("_")[0] for x in stored['ids']]

    # The id is derived from the link: SHA-1 of the URL reduced modulo 10**8.
    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))
    story = "None"
    if this_id in stored_ids:
        if not overwrite:
            return link + " link already in the db, skip"
        # Start from the stored text so that runlm=False keeps it.
        story = stored['documents'][stored_ids.index(this_id)]

    # Load the page and split it into chunks for summarization.
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    # Find the CG picture link referenced by the page.
    cg_link = get_cg(docs_org)
    meta['cg'] = cg_link

    # Summarize the story: map-reduce for more than two chunks, otherwise a
    # single stuff chain.
    if runlm:
        if len(docs) > 2:
            story = story_summary(docs)
        else:
            story = story_summary_stuff(docs)

    # Write the summary and its metadata to the vector db under the link id.
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    # #load the database for original page text
    # db2 = Chroma(persist_directory="./arkpage", embedding_function=OpenAIEmbeddings())
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 5000, chunk_overlap = 0)
    # docs2 = text_splitter.split_documents(docs_org)
    # this_list =[this_id + "_" + "{0:0=4d}".format(x) for x in range(len(docs2))]
    # output_docs2 =[Document(page_content=docs2[x].page_content, metadata=meta) for x in range(len(docs2))]
    # db2.add_documents(output_docs2, ids = this_list)

    return output_doc

#return CG link from the story
def get_cg(docs):
    """Return a CG picture URL for the story page, or "" when none is found.

    The first ``[Image(image="...")]`` tag in ``docs[0].page_content`` names
    the picture; its prts.wiki file page is downloaded and scanned for
    https links.

    Args:
        docs: documents of the web page; only the first one is inspected.

    Returns:
        The third link found on the file page when more than three links
        were found and that link contains '640'; otherwise the first link.
        "" when ``docs`` is empty, no image tag is present, the file page
        cannot be fetched, or it contains no https link.
    """
    from urllib.request import urlopen

    # Nothing to inspect (e.g. the loader returned no documents).
    if not docs:
        return ""
    # Capture the picture name that follows `[Image(image="`.
    regex=r'(?<=\[Image\(image=\")[\w_]+'
    pics = re.findall(regex, docs[0].page_content)
    if len(pics)==0:
        return ""
    link = "https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_"+pics[0]+".png"
    print(link)
    # Fetch the file page; the response is closed as soon as it has been read.
    try:
        with urlopen(link) as response:
            html_page = response.read()
    except Exception:
        # Best effort: any fetch failure means "no CG", but KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return ""
    pics = re.findall(r'https://[\w./-]+',str(html_page))
    if len(pics)==0:
        return ""
    # NOTE(review): the guard requires more than three links although only
    # index 2 is read - kept as in the original, confirm whether >= 3 is meant.
    if len(pics)>3 and '640' in pics[2]:
        return pics[2]
    return pics[0]

#main function: summarize the web pages listed in the scraped JSON file
def run_scapy(file = "quotes.json", limit = "", overwrite = False, runlm = True):
    """Summarize every page of a scraped JSON list that matches ``limit``.

    The JSON file is rewritten after each processed entry, so the changes
    made to the entries are saved as the run progresses.

    Args:
        file: path of the JSON file holding a list of page metadata dicts
            (each with a 'source' and a 'stage' entry).
        limit: only entries whose stage or source contains this text are
            processed; the default "" matches every entry.
        overwrite: forwarded to ``websummary_meta``.
        runlm: forwarded to ``websummary_meta``.

    Returns:
        None.
    """
    import json, time
    # Opening JSON file
    with open(file, encoding="utf-8") as f:
        scapy_list = json.load(f)

    for entry in scapy_list:
        # A stage may be None (websummary_meta normalizes that too); treat it
        # as empty text instead of failing on the `in` test.
        stage = entry.get("stage") or ""
        if limit in stage or limit in entry["source"]:
            print(entry)
            print(websummary_meta(entry, overwrite, runlm, limit))
            entry['indexed'] = True
            # Save progress after every entry.
            with open(file, "w", encoding="utf-8") as outfile:
                outfile.write(json.dumps(scapy_list))
            # Pause one second between pages.
            time.sleep(1)


In [10]:
# Process the entries of prts010924.json whose stage or source contains "RO3";
# overwrite=True revisits links already in the db, runlm=False skips the LLM summary.
run_scapy(file = "prts010924.json", limit = "RO3", overwrite = True, runlm = False)

{'source': 'https://prts.wiki/index.php?title=RO3-BEG/NBT&action=edit', 'indexed': True, 'stage': 'RO-序章', 'cg': ''}


Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179


page_content='None' metadata={'source': 'https://prts.wiki/index.php?title=RO3-BEG/NBT&action=edit', 'indexed': True, 'stage': 'RO3-RO-序章', 'cg': ''}
{'source': 'https://prts.wiki/index.php?title=RO3-END-1/NBT&action=edit', 'indexed': True, 'stage': 'RO-越过群山', 'cg': 'https://prts.wiki/images/thumb/7/7c/Avg_pic_rogue_3_31.png/640px-Avg_pic_rogue_3_31.png'}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_pic_rogue_3_31.png


Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179


page_content='None' metadata={'source': 'https://prts.wiki/index.php?title=RO3-END-1/NBT&action=edit', 'indexed': True, 'stage': 'RO3-RO-越过群山', 'cg': 'https://prts.wiki/images/thumb/7/7c/Avg_pic_rogue_3_31.png/640px-Avg_pic_rogue_3_31.png'}
{'source': 'https://prts.wiki/index.php?title=RO3-END-2/NBT&action=edit', 'indexed': True, 'stage': 'RO-直至冬夜降临', 'cg': 'https://prts.wiki/images/thumb/5/57/Avg_pic_rogue_3_32.png/640px-Avg_pic_rogue_3_32.png'}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_pic_rogue_3_32.png


Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179


page_content='None' metadata={'source': 'https://prts.wiki/index.php?title=RO3-END-2/NBT&action=edit', 'indexed': True, 'stage': 'RO3-RO-直至冬夜降临', 'cg': 'https://prts.wiki/images/thumb/5/57/Avg_pic_rogue_3_32.png/640px-Avg_pic_rogue_3_32.png'}
{'source': 'https://prts.wiki/index.php?title=RO3-END-3/NBT&action=edit', 'indexed': True, 'stage': 'RO-自深处的一瞥', 'cg': ''}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_pic_rogue_3_33.png


Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179


page_content='None' metadata={'source': 'https://prts.wiki/index.php?title=RO3-END-3/NBT&action=edit', 'indexed': True, 'stage': 'RO3-RO-自深处的一瞥', 'cg': 'https://prts.wiki/images/thumb/6/6a/Avg_pic_rogue_3_33.png/640px-Avg_pic_rogue_3_33.png'}
{'source': 'https://prts.wiki/index.php?title=RO3-END-4/NBT&action=edit', 'indexed': True, 'stage': 'RO-终始', 'cg': 'https://prts.wiki/images/thumb/3/39/Avg_pic_rogue_3_35.png/640px-Avg_pic_rogue_3_35.png'}
https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_pic_rogue_3_35.png


Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179
Delete of nonexisting embedding ID: 176
Delete of nonexisting embedding ID: 177
Delete of nonexisting embedding ID: 178
Delete of nonexisting embedding ID: 179


page_content='None' metadata={'source': 'https://prts.wiki/index.php?title=RO3-END-4/NBT&action=edit', 'indexed': True, 'stage': 'RO3-RO-终始', 'cg': 'https://prts.wiki/images/thumb/3/39/Avg_pic_rogue_3_35.png/640px-Avg_pic_rogue_3_35.png'}


In [21]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Open the persisted "./cndb" Chroma store for manual inspection.
db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())

In [22]:
# Spot-check the metadata of one stored entry (position 10 of the full listing).
db.get()['metadatas'][10]

{'cg': 'https://prts.wiki/images/thumb/f/f6/Avg_32_i03.png/640px-Avg_32_i03.png',
 'indexed': True,
 'source': 'https://prts.wiki/index.php?title=13-6_%E5%85%B8%E8%8C%83%E4%B9%8B%E5%90%8D/END&action=edit',
 'stage': '13-6 典范之名 行动后'}

In [8]:
# Load a single story page by hand to inspect the raw loader output.
link = "https://prts.wiki/index.php?title=ZT-1_%E6%B8%85%E5%94%B1%E2%80%9C%E6%99%B4%E7%A9%BA%E4%B9%8B%E6%AD%8C%E2%80%9D/BEG&action=edit"
res = web_loader_docs(link)


In [23]:
import chromadb
from chromadb.utils import embedding_functions

# Open the "langchain" collection of the persisted "./cndb" store directly
# through the chromadb client.
client = chromadb.PersistentClient(path="./cndb")
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.environ['OPENAI_API_KEY'],
                model_name="text-embedding-ada-002"
            )
db = client.get_collection(name="langchain", embedding_function=openai_ef)

# Collect and print every entry whose source URL contains "TG-".
ids = []
this = db.get()
# Iterate the parallel lists together; the loop variables are named so that
# the builtin `id` is not shadowed in the notebook namespace.
for entry_id, entry_meta, entry_doc in zip(this['ids'], this['metadatas'], this['documents']):
    if "TG-" in entry_meta['source']:
        ids.append(entry_id)
        print(entry_id)
        print(entry_meta['source'])
        print(entry_doc)

58637907
https://prts.wiki/index.php?title=TG-ST1_%E7%9B%96%E4%B8%8D%E4%BD%8F%E7%9A%84%E9%94%85%E7%9B%96/NBT&action=edit
故事发生在一个小镇上，阿兰娜和小锅盖是好朋友。他们一起在阿兰娜的运载车上工作，准备离开这个小镇。阿兰娜做了很多菜，为了让菜保持热，她敲碗提醒它们慢慢变凉。他们聊了很多关于食物的话题，然后听到了奇怪的声音。他们发现是一个陌生人在敲门，他们决定去看看。在路上，小锅盖不小心推车撞到了一个人，但幸运的是没有受伤。他们来到了一个百货店，阿兰娜想买一只沙地兽，而小锅盖想卖掉一些东西。百货店老板同意帮助他们，但他们需要找到特定型号的安全阀。然而，他们没有找到合适的阀门，他们决定继续寻找。故事结束时，他们决定分工，继续努力寻找所需的物品。

在对话中，小锅盖想要找到一些东西，请求百货店老板帮忙看着推车。百货店老板询问阿兰娜是否还需要购买其他东西。沉默寡言的顾客没有回答。阿兰娜和小查理之间发生了一些争执，他们讨论了阿兰娜照顾小孩的问题。阿兰娜解释了她为什么要照顾小孩，并威胁要把球踢到小查理的脸上。他们继续讨论阿兰娜是否会回来，最后决定明天再谈。小锅盖去买东西，回来后发现阿兰娜已经完成了工作，并和叔叔们打球。阿兰娜解释了她为什么能这么快完成工作。他们讨论了剩下的工作和明天的行程。小锅盖提议帮忙交《停运报告书》，以便阿兰娜能多和朋友们聊一会儿。最后，小锅盖遇到一个畏畏缩缩的乘客，他们讨论了乘车时间和等待的人。小锅盖最后决定帮助乘客等待的人。

在这段对话中，小锅盖正在等待进入办公室提交文件，但是时间很紧迫。她和一位名叫雷厉风行的女性聊天，女性抱怨人太多，办事效率低下。小锅盖担心文件来不及交，但是女性表示可以帮她交。小锅盖犹豫不决，但最终还是把文件交给了女性。之后，小锅盖赶上了车，但她心里还有些不安。最后，她听到广播说车要去咧嘴谷，但突然发现车被劫持了。
30681391
https://prts.wiki/index.php?title=TG-ST2_%E4%B8%8D%E5%9B%9E%E5%A4%B4%E7%9A%84%E8%BD%A6%E8%BE%99/NBT&action=edit
这段对话发生在一个运载车上，阿兰娜是驾驶员，而劫车人是乘客。

In [20]:
# Kept commented out: when enabled, this deletes the first three entries of `ids` from `db`.
#db.delete(ids=ids[:3])