In [4]:
from typing import Any, List, Mapping, Optional
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
import requests

class CustomLLM(LLM):
    """LangChain LLM wrapper that posts the prompt to a local completions endpoint.

    Every call sends one HTTP POST to ``http://localhost:5000/v1/completions``
    and returns the ``text`` field of the first entry in ``choices``.
    """

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM type."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to the completions endpoint and return the generated text.

        Args:
            prompt: text to complete.
            stop: optional stop strings; when a list is given, four extra
                stop strings are appended before it is sent.
            run_manager: accepted for compatibility with the signature used by
                ``FakeLLM``; not used here.
            **kwargs: accepted so that extra keyword arguments forwarded by the
                caller do not raise ``TypeError``; not used here.

        Returns:
            The ``text`` of the first choice in the JSON response.

        Raises:
            requests.HTTPError: when the server answers with an error status.
        """
        # NOTE(review): when ``stop`` is None the extra stop strings are not
        # added and ``stopping_strings`` is sent as null -- confirm intended.
        if isinstance(stop, list):
            stop = stop + ["\n###", "\nObservation:", "\n问题", "\nQuestion:"]
        HOST = 'localhost:5000'
        URI = f'http://{HOST}/v1/completions'

        # NOTE(review): no request timeout is set, so a stalled server blocks
        # this call indefinitely.
        response = requests.post(
            URI,
            json={
                "prompt": prompt,
                "temperature": 0.1,
                "max_tokens": 16384,
                "stop_at_newline": True,
                "early_stopping": True,
                "stopping_strings": stop,
                "do_sample": True,
                "top_p": 0.1,
                "typical_p": 1,
                "repetition_penalty": 1.18,
                "top_k": 40,
                "min_length": 0,
                "no_repeat_ngram_size": 0,
                "num_beams": 1,
                "penalty_alpha": 0,
                "length_penalty": 1,
                "seed": -1,
                "add_bos_token": True,
                "truncation_length": 8192,
                "ban_eos_token": False,
                "skip_special_tokens": True,
            },
        )
        response.raise_for_status()
        return response.json()['choices'][0]['text']

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {}

class CustomLLM2(LLM):
    """LangChain LLM wrapper that posts the prompt to a local chat-completions endpoint.

    Every call sends one HTTP POST to
    ``http://localhost:5000/v1/chat/completions`` with the prompt as a single
    ``user`` message and returns the content of the first choice's message.
    """

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM type."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` as one user message and return the reply text.

        Args:
            prompt: text placed in the ``content`` of the single user message.
            stop: optional stop strings; when a list is given, four extra
                stop strings are appended and the result is sent as ``stop``.
            run_manager: accepted for compatibility with the signature used by
                ``FakeLLM``; not used here.
            **kwargs: accepted so that extra keyword arguments forwarded by the
                caller do not raise ``TypeError``; not used here.

        Returns:
            ``choices[0].message.content`` of the JSON response.

        Raises:
            requests.HTTPError: when the server answers with an error status.
        """
        if isinstance(stop, list):
            stop = stop + ["\n###", "\nObservation:", "\n问题", "\nQuestion:"]
        HOST = 'localhost:5000'
        URI = f'http://{HOST}/v1/chat/completions'

        payload = {
            "messages": [
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            "mode": "instruct",
            "instruction_template": "Alpaca",
        }
        # Previously the stop list was built and then dropped. It is now
        # forwarded, but only when the caller supplied one, so requests made
        # without stop strings are unchanged.
        # NOTE(review): ``stop`` is the OpenAI chat-completions field name --
        # confirm the local server honours it.
        if stop is not None:
            payload["stop"] = stop

        response = requests.post(URI, json=payload)
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {}

# Notebook-wide LLM instance (chat-completions wrapper); the summary chains read this global.
llm = CustomLLM2()

In [5]:
class FakeLLM(LLM):
    """Stand-in LLM that echoes back the first ``n`` characters of the prompt."""

    # Upper bound on the number of prompt characters returned by ``_call``.
    n: int

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM type."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Return ``prompt`` truncated to ``self.n`` characters.

        Raises:
            ValueError: if any ``stop`` value other than None is passed.
        """
        if stop is None:
            return prompt[: self.n]
        raise ValueError("stop kwargs are not permitted.")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that identify this instance (just ``n``)."""
        return dict(n=self.n)

# Pass-through "LLM": returns at most the first 10000 characters of whatever prompt it receives.
llm_fake = FakeLLM(n=10000)

In [8]:
#this script summarizes the webpage from URL; writes its page content, meta data into a vector db

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

import configparser, os
# Read API credentials from ./keys.ini and export them as environment variables.
# ConfigParser.read() silently ignores a missing file, so an absent keys.ini
# shows up as a KeyError on the section lookups below.
config = configparser.ConfigParser()
config.read('./keys.ini')
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
# The OpenAI key is also kept in a notebook-level variable.
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader and split
def web_loader_docs(link:str):
    """Load the web page at ``link`` with ``WebBaseLoader``.

    Args:
        link: URL of the web page.

    Returns:
        Whatever ``WebBaseLoader(link).load()`` returns; no splitting is done here.
    """
    return WebBaseLoader(link).load()


def story_summary(docs):
    """Summarize ``docs`` with a map-reduce chain and return the chain's output.

    Map step: the notebook-level ``llm`` is prompted once per document for a
    dialogue summary. Reduce step: the per-document summaries are stuffed into
    one prompt and handed to the notebook-level ``llm_fake``.

    Args:
        docs: documents of the web page.
    """
    # Map: one dialogue summary per document.
    mapper = LLMChain(
        llm=llm,
        prompt=PromptTemplate.from_template(
            """The following are documents containing dialogues
        {docs}
        Write a detailed summary of the dialogues in each. ONLY summarize the dialogues.
        Output:"""
        ),
    )

    # Reduce: the prompt is just the concatenated summaries, sent to llm_fake.
    reducer = LLMChain(
        llm=llm_fake,
        prompt=PromptTemplate.from_template(
            """
        {doc_summaries}
        """
        ),
    )

    # Joins a list of documents into one string and passes it to the reducer.
    stuffer = StuffDocumentsChain(
        llm_chain=reducer,
        document_variable_name="doc_summaries",
    )

    # The same stuff chain is used for the final combine and for collapsing
    # groups of documents; token_max is the token limit per group.
    reduce_stage = ReduceDocumentsChain(
        combine_documents_chain=stuffer,
        collapse_documents_chain=stuffer,
        token_max=6000,
    )

    # Full pipeline: map over the documents, then reduce. Only the final
    # result is returned (no intermediate map outputs).
    pipeline = MapReduceDocumentsChain(
        llm_chain=mapper,
        reduce_documents_chain=reduce_stage,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

#use a single chain to summarize the story dialogue from the main page content of the URL
def story_summary_stuff(docs):
    """Summarize ``docs`` in one LLM call by stuffing them into a single prompt.

    Args:
        docs: documents of the web page.

    Returns:
        The output of the stuff chain run with the notebook-level ``llm``.
    """
    summary_prompt = PromptTemplate.from_template(
        """Write a detailed summary of Amiya's dialogue.
    "{text}"
    Output:"""
    )
    # All documents are inserted into the prompt's {text} slot.
    chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=llm, prompt=summary_prompt),
        document_variable_name="text",
    )
    return chain.run(docs)


In [7]:
# Load the Fandom wiki story page and split it (chunk_size=25000, chunk_overlap=500).
link = "https://arknights.fandom.com/wiki/10-8/Story"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 25000, chunk_overlap = 500)
docs = text_splitter.split_documents(docs_org)
# Number of chunks produced by the splitter (shown as the cell output).
len(docs)

1

In [12]:
# Summarize the loaded chunks with the single-prompt "stuff" chain.
story_summary_stuff(docs)

"\nIn the tenth chapter of the Amiya story in Arknights, the Operator, Amiya, and the Self-Salvation Corps are planning a joint operation to rescue Heidi Thomson, a messenger who has been taken prisoner by the Sarkaz. Amiya, along with Clovisia, learns that she was captured while attending a social gathering, and her safety is important to both Rhodes Island and the Self-Salvation Corps as she possesses valuable information. The Sarkaz have been using an abandoned factory as a prison for their captives, and this location is considered a priority for the rescue mission. Clovisia also informs Amiya that a Self-Salvation Corps soldier, Lawrence, has come with crucial information about Heidi's location. The operation has been moved up due to the importance of rescuing Heidi, and Amiya and her team are eager to work together with the Self-Salvation Corps on this mission."

In [13]:
# Inspect the first chunk to see what text the loader extracted.
docs[0]

Document(page_content='10-8 story | Arknights Wiki | Fandom\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nArknights Wiki\n\n\n\n\n\n Explore\n\n \n\n\n\n\n Main Page\n\n\n\n\n Discuss\n\n\n\n\nAll Pages\n\n\n\n\nCommunity\n\n\n\n\nInteractive Maps\n\n\n\n\nRecent Blog Posts\n\n\n\n\n\n\n\n\nTerra\n\n \n\n\n\n\nStory\n\n\n\n\nFactions\n \n\n\n\n\nRhodes Island\n\n\n\n\nReunion\n\n\n\n\nDublinn\n\n\n\n\nKazdel\n\n\n\n\n\n\n\nRace\n\n\n\n\nOriginium\n \n\n\n\n\nOripathy\n\n\n\n\nInfected\n\n\n\n\nOriginium Arts\n\n\n\n\n\n\n\nCatastrophe\n\n\n\n\nNomadic city\n\n\n\n\nTimeline\n\n\n\n\n\n\n\n\nArknights\n\n \n\n\n\n\nOperators\n \n\n\n\n\nList of Operators\n \n\n\n\n\n6-star\n\n\n\n\n5-star\n\n\n\n\n4-star\n\n\n\n\n3-star\n\n\n\n\n2-star\n\n\n\n\n1-star\n\n\n\n\n\n\n\nHeadhunting\n \n\n\n\n\nBanners\n\n\n\n\n\n\n\nRecruitment\n \n\n\n\n\nTag combinations\n\n\n\n\n\n\n\nOutfits\n\n\n\n\nParadox Simulations\n\n\n\n\nOperator Records\n\n\n

In [31]:
# Load the page's edit view (action=edit) and split it into chunks.
link = "https://prts.wiki/index.php?title=13-21_%E6%AE%B7%E7%BA%A2%E5%90%9B%E4%B8%BB/END&action=edit"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=500)
docs = text_splitter.split_documents(docs_org)
# More than two chunks -> map-reduce chain; otherwise the single-prompt stuff chain.
story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)
story

Token indices sequence length is longer than the specified maximum sequence length for this model (1802 > 1024). Running this sequence through the model will result in indexing errors


'\n        \nDialogue 13-21, labeled as "殷红君主/END," is a conversation between characters in the Chinese version of Genshin Impact\'s Wiki. This particular dialogue is not included in the game itself, but rather it appears in the Wiki. Due to the restricted editing permission, we\'re unable to directly access or provide the source code of this conversation. However, we can provide you with a summary of the content of the dialogue based on the available information.\n\nThe conversation encompasses a series of exchanges between various characters in the game. Given that the exact wording and nuances of their dialogue aren\'t directly accessible, here\'s a simplified description of the potential context:\n\n1. The conversation begins with a character expressing their gratitude to the player for their support and aid throughout their journey. This could be an NPC thanking the player for their assistance in a quest or series of quests.\n2. Another character may share their concerns or regret

In [32]:
#this script summarizes the webpage from URL; writes its page content, meta data into a vector db

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os, re
# Read API credentials from ./keys.ini and export them as environment variables.
# ConfigParser.read() silently ignores a missing file, so an absent keys.ini
# shows up as a KeyError on the section lookups below.
config = configparser.ConfigParser()
config.read('./keys.ini')
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
# The OpenAI key is also kept in a notebook-level variable.
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#web loader and split
def web_loader_docs(link:str):
    """Load the web page at ``link`` with ``WebBaseLoader``.

    Args:
        link: URL of the web page.

    Returns:
        Whatever ``WebBaseLoader(link).load()`` returns; no splitting is done here.
    """
    return WebBaseLoader(link).load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
#summary the story dialogue from main page content of the URL
def story_summary(docs):
    """Summarize ``docs`` with a map-reduce chain and return the chain's output.

    Map step: the notebook-level ``llm`` is prompted (in Chinese) once per
    document for a dialogue summary. Reduce step: the per-document summaries
    are stuffed into one prompt and handed to the notebook-level ``llm_fake``.

    Args:
        docs: documents of the web page.
    """
    # Map: one dialogue summary per document.
    mapper = LLMChain(
        llm=llm,
        prompt=PromptTemplate.from_template(
            """总结人物的对话。
        {docs}
        只输出中文。只输出总结，不需要评论故事。
        输出:"""
        ),
    )

    # Reduce: the combined summaries are sent to llm_fake rather than to llm.
    reducer = LLMChain(
        llm=llm_fake,
        prompt=PromptTemplate.from_template(
            """依次陈列这几段故事情节。
        {doc_summaries}
        只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
        输出:"""
        ),
    )

    # Joins a list of documents into one string and passes it to the reducer.
    stuffer = StuffDocumentsChain(
        llm_chain=reducer,
        document_variable_name="doc_summaries",
    )

    # The same stuff chain is used for the final combine and for collapsing
    # groups of documents; token_max is the token limit per group.
    reduce_stage = ReduceDocumentsChain(
        combine_documents_chain=stuffer,
        collapse_documents_chain=stuffer,
        token_max=6000,
    )

    # Full pipeline: map over the documents, then reduce. Only the final
    # result is returned (no intermediate map outputs).
    pipeline = MapReduceDocumentsChain(
        llm_chain=mapper,
        reduce_documents_chain=reduce_stage,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

#use a single chain to summarize the story dialogue from the main page content of the URL
def story_summary_stuff(docs):
    """Summarize ``docs`` in one LLM call by stuffing them into a single prompt.

    Args:
        docs: documents of the web page.

    Returns:
        The output of the stuff chain run with the notebook-level ``llm``.
    """
    summary_prompt = PromptTemplate.from_template(
        """总结人物的对话。
    "{text}"
    只输出中文。只输出故事，不需要评论故事。不要输出重复片段!
    输出:"""
    )
    # All documents are inserted into the prompt's {text} slot.
    chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=llm, prompt=summary_prompt),
        document_variable_name="text",
    )
    return chain.run(docs)

#summarize a web page described by one metadata entry of the scraped JSON
def websummary_meta(meta:dict, overwrite = False, runlm = True):
    """Summarize the page described by ``meta`` and store the summary in the Chroma db.

    The db id of a page is the SHA-1 of its URL reduced modulo 10**8.

    Args:
        meta: metadata dict of the target page. ``meta['source']`` is the page
            URL and ``meta['stage']`` is read as well. The dict is updated in
            place: a None ``stage`` becomes "", and ``cg`` and ``indexed`` are set.
        overwrite: when falsy, a page whose id is already in the db is skipped.
        runlm: when truthy, a new summary is generated with the summary chains;
            when falsy, the summary already stored for the page is reused.

    Returns:
        The ``Document`` written to the db, or a message string when the page
        was skipped because it is already stored.

    Raises:
        ValueError: if ``runlm`` is falsy and there is no stored summary to
            reuse (page not in the db, or in the db but not being overwritten
            is handled by the skip above).
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']
    #meta['characters'] = ','.join(meta['characters'])
    if meta['stage'] is None:
        meta['stage'] = ""

    # Load the vector db holding the summaries and fetch its contents once.
    db = Chroma(persist_directory="./cndb", embedding_function=OpenAIEmbeddings())
    stored = db.get()
    # Stored ids may carry a "_suffix"; only the part before it is compared.
    stored_ids = [x.split("_")[0] for x in stored['ids']]

    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))
    story = None
    if this_id in stored_ids:
        if not overwrite:
            return link + " link already in the db, skip"
        # Keep the stored summary so it can be reused when runlm is off.
        story = stored['documents'][stored_ids.index(this_id)]

    # Without this check ``story`` used to be unbound further down, failing
    # with an UnboundLocalError only after the page had been fetched.
    if not runlm and story is None:
        raise ValueError(
            link + " has no stored summary to reuse; call with runlm=True"
        )

    # Load the page and split it into chunks.
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    # Find the CG image link on the page and record it in the metadata.
    cg_link = get_cg(docs_org)
    meta['cg'] = cg_link

    # Summarize: map-reduce chain for more than two chunks, stuff chain otherwise.
    if runlm:
        if len(docs) > 2:
            story = story_summary(docs)
        else:
            story = story_summary_stuff(docs)

    # Write the summary and its metadata to the vector db under the page id.
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    # #load the database for original page text
    # db2 = Chroma(persist_directory="./arkpage", embedding_function=OpenAIEmbeddings())
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 5000, chunk_overlap = 0)
    # docs2 = text_splitter.split_documents(docs_org)
    # this_list =[this_id + "_" + "{0:0=4d}".format(x) for x in range(len(docs2))]
    # output_docs2 =[Document(page_content=docs2[x].page_content, metadata=meta) for x in range(len(docs2))]
    # db2.add_documents(output_docs2, ids = this_list)

    return output_doc

#return CG link from the story
def get_cg(docs):
    """Return the URL of a CG image referenced by the first document, or "".

    The first ``[Image(image="NAME"`` tag in ``docs[0].page_content`` gives the
    image name; the matching prts.wiki file page is fetched and an image URL
    is picked from the ``https://`` URLs found in it.

    Args:
        docs: sequence of objects with a ``page_content`` string; only the
            first one is inspected.

    Returns:
        An image URL, or "" when ``docs`` is empty, no image tag is found, the
        file page cannot be fetched, or it contains no ``https://`` URL.
    """
    from urllib.request import urlopen

    # Nothing to inspect (previously an IndexError).
    if not docs:
        return ""

    # Image name taken from the first image tag in the dialogue source.
    names = re.findall(r'(?<=\[Image\(image=\")[\w_]+', docs[0].page_content)
    if not names:
        return ""
    link = "https://prts.wiki/w/%E6%96%87%E4%BB%B6:Avg_" + names[0] + ".png"
    print(link)

    # Best-effort fetch: any ordinary failure means "no CG". ``Exception``
    # (rather than a bare except) lets KeyboardInterrupt/SystemExit through,
    # and the ``with`` block closes the response.
    try:
        with urlopen(link) as response:
            html_page = response.read()
    except Exception:
        return ""

    # The regex is run over the str() form of the raw bytes, as before, so the
    # set and order of matched URLs is unchanged.
    urls = re.findall(r'https://[\w./-]+', str(html_page))
    if not urls:
        return ""
    # Use the third URL when it looks like the 640px rendition, else the first.
    if len(urls) > 3 and '640' in urls[2]:
        return urls[2]
    return urls[0]

#main entry point: summarize every web page listed in the scraped JSON file
def run_scapy(file = "quotes.json", limit = "", overwrite = False, runlm = True):
    """Summarize every page listed in a JSON file whose stage contains ``limit``.

    Args:
        file: path of a JSON file holding a list of page metadata dicts.
        limit: substring filter on each entry's ``stage``; the default ""
            matches every entry.
        overwrite: forwarded to ``websummary_meta``.
        runlm: forwarded to ``websummary_meta``.

    Side effects:
        Prints each processed entry and the result of ``websummary_meta``,
        marks the entry ``indexed`` and rewrites ``file`` after every entry so
        progress is saved, then sleeps one second.
    """
    import json, time

    with open(file, encoding="utf-8") as f:
        scapy_list = json.load(f)

    for entry in scapy_list:
        # A missing or null stage is treated as "" so the substring test
        # cannot raise TypeError on None.
        stage = entry.get("stage") or ""
        if limit not in stage:
            continue
        print(entry)
        print(websummary_meta(entry, overwrite, runlm))
        entry['indexed'] = True
        # Persist the whole list after each page.
        with open(file, "w", encoding="utf-8") as outfile:
            outfile.write(json.dumps(scapy_list))
        time.sleep(1)


In [33]:
# Load the page's edit view (action=edit) and split it into chunks.
link = "https://prts.wiki/index.php?title=13-21_%E6%AE%B7%E7%BA%A2%E5%90%9B%E4%B8%BB/END&action=edit"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=500)
docs = text_splitter.split_documents(docs_org)
# More than two chunks -> map-reduce chain; otherwise the single-prompt stuff chain.
story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)
story

'依次陈列这几段故事情节。\n        \n这段源代码是来自明日方舟中文wiki的13-21殷红君主/END页面的 З组 ( Zach ) 版本。该段代码在跳过导航和搜索部分后开始，限定了编辑权限只有用户和编辑员组的成员才能编辑。同时，这段代码只输出了中文并且只输出了对话总结，不需要评论故事。\n\n本话题总зи人物对话。\n本角色对话包括String类型元素，用于表示文本内容，以及名称（character）和时长（duration）等元素，用于控制字符出言及其 Row 位置和移动效果等。\n主要介绍有 "血魔大君" 和 "Logos" 等角色在对话中互动的情况。前者拥有 high Magical resistance 和 high Physical resistance，表明其服装或具有强大的防御力，前后切换行数指明对话与其他对话分别出现在哪一行。\n文本内容中包含理论争议和对家庭背景的讽刺内容，特别是 "血魔大君" doiA3FdC的义情表述自己为 "萨卡" 的同。\n扩展解释：\n本对话中 "Logos" 为表达反感触发了 "血魔大君" 的议论力，导致行动时间降低并触发某些效果。 "血魔大君" 则自称为 "萨卡" 的同，所作行为被 "血魔大君" 解释为一种权reedom实现的表明，但 "Logos" 对此表示反感并自称会一会。 "血魔大君" 在此期间展示出和ockeyRefusal，并表示以己所为的行为无可，承认不是тісunaru的传承人，但对他的行为认为权力的含义不是引起伤或虚。\n文本内容中还包含一些文化引用，例如 "阿米" 针对 "血魔大君" 的发言中随itoriferences in the text. “火与血”可能是引用一种长青（Green-text）文化\n\n血魔大君正在宣言自己的复生时，被阿米娅和Logos一同曲唱挽歌，摧毁了他的结晶。血魔大君认为他已经把提卡兹的血还给了故乡，然而他只是在像 Са raison чество那样广泛的挥动着自己的卑劣的部件和苍白的骸骨，下一瞬就見到了一个消失再现的证Proof爱克斯。他宣言著自己将击败管理员，但实际上只是被挽歌歌啦erts证Proof爱克斯与Logos抓住了，自己被终装成了一块无法行动并被门店员收取的 ле器装品。血魔大君认为自己被敌人 Мар科兹控制着，但实际上他自己选择了纳陷，最终