In [5]:
# This notebook summarizes the story on a web page given its URL, then writes the summary, the page content, and the page metadata into Chroma vector databases.

from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.chat_models import ChatOpenAI


import configparser, os
# Load the API credentials from a local INI file so they are never hardcoded in the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed; check it so a missing file fails with a clear message
# instead of a bare KeyError on the section lookups below.
if not config.read('./keys.ini'):
    raise FileNotFoundError(
        "./keys.ini could not be read; it must provide the [GOOGLE] and [OPENAI] sections"
    )
os.environ['GOOGLE_API_KEY'] = config['GOOGLE']['GOOGLE_API_KEY']
os.environ['GOOGLE_CSE_ID'] = config['GOOGLE']['GOOGLE_CSE_ID']
# Kept as a module-level name as well as an environment variable.
openai_api_key = config['OPENAI']['OPENAI_API_KEY']
os.environ['OPENAI_API_KEY'] = openai_api_key

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Web page loader (the loaded documents are split later, by the callers)
def web_loader_docs(link:str):
    """Load the web page at ``link`` and return the loaded documents.

    No splitting happens here: the result of ``WebBaseLoader(link).load()``
    is handed back unchanged.

    Args:
        link: URL of the web page to load.

    Returns:
        The documents produced by ``WebBaseLoader.load()``.
    """
    return WebBaseLoader(link).load()


#story summary chain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
# Summarize the story dialogue from the main page content of the URL (map-reduce chain)
def story_summary(docs):
    """Summarize a story spread over several documents with a map-reduce chain.

    Every document in ``docs`` is summarized on its own (map step); the partial
    summaries are then combined into one detailed summary (reduce step).

    Args:
        docs: documents of the web page.

    Returns:
        The result of ``MapReduceDocumentsChain.run(docs)``; callers use it as
        the summary text.
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # Map step: this prompt is applied to the documents via the "docs" variable.
    per_doc_prompt = PromptTemplate.from_template("""The following is a set of documents
        {docs}
        Write a detailed summary of the story in each. 
        Helpful Answer:""")
    per_doc_chain = LLMChain(llm=chat_model, prompt=per_doc_prompt)

    # Reduce step: this prompt receives the partial summaries via "doc_summaries".
    merge_prompt = PromptTemplate.from_template("""The following is set of summaries:
        {doc_summaries}
        Take these and combine into a detailed summary of the story. List each section seperately.
        Helpful Answer:""")
    merge_chain = LLMChain(llm=chat_model, prompt=merge_prompt)

    # Joins a list of documents into a single string and feeds it to merge_chain.
    stuff_summaries = StuffDocumentsChain(
        llm_chain=merge_chain, document_variable_name="doc_summaries"
    )

    # The same stuffing chain serves as the final combine step and as the
    # collapse step used when the documents exceed the stuffing context;
    # token_max is the maximum number of tokens to group documents into.
    reducer = ReduceDocumentsChain(
        combine_documents_chain=stuff_summaries,
        collapse_documents_chain=stuff_summaries,
        token_max=6000,
    )

    # Map the per-document chain over the input, then reduce the results.
    # Intermediate map outputs are not returned.
    pipeline = MapReduceDocumentsChain(
        llm_chain=per_doc_chain,
        reduce_documents_chain=reducer,
        document_variable_name="docs",
        return_intermediate_steps=False,
    )
    return pipeline.run(docs)

# Summarize the story dialogue from the main page content of the URL with a single "stuff" chain
def story_summary_stuff(docs):
    """Summarize a story in a single LLM call by stuffing all documents into one prompt.

    Args:
        docs: documents of the web page.

    Returns:
        The result of ``StuffDocumentsChain.run(docs)``; callers use it as the
        summary text.
    """
    summary_prompt = PromptTemplate.from_template("""Write a detailed summary of the story. List each section seperately.:
    "{text}"
    Output ONLY the story summary:""")

    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # The documents are inserted into the prompt through the "text" variable.
    single_pass_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=summary_prompt),
        document_variable_name="text",
    )
    return single_pass_chain.run(docs)

# Summarize the metadata of the page
def meta_summary(docs):
    """Extract title, category, characters and fandom from the first document of a page.

    Only ``docs[0].page_content`` is sent to the extraction chain; any further
    documents are ignored.

    Args:
        docs: documents of the web page; must contain at least one element.

    Returns:
        The first item of the extraction chain's result.
    """
    from langchain.chains import create_extraction_chain

    # Prompt for the metadata extraction; the page text is inserted at {input}.
    extraction_prompt = PromptTemplate.from_template("""
    The input is an html page.
    Extract the following from the html page input
    1. the title, 
    2. a list of category, seperated by comma
    3. list of characters, seperated by comma
    4. the overall fandom / genere that the pages belong to

    "{input}"
    Outputs:""")

    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")

    # All four fields are strings and all of them are required.
    fields = ("title", "category", "characters", "fandom")
    schema = {
        "properties": {field: {"type": "string"} for field in fields},
        "required": list(fields),
    }

    extractor = create_extraction_chain(schema=schema, prompt=extraction_prompt, llm=chat_model)
    results = extractor.run(docs[0].page_content)
    return results[0]


#summarize a web page then write to vector db
def websummary(link:str):
    """Summarize the story on one web page and store the summary in the ``./arkdb`` Chroma db.

    The page is identified by a numeric id derived from the SHA-1 of ``link``
    (reduced modulo 10**8). If that id is already present in the db the page
    is skipped and nothing is loaded or summarized.

    Args:
        link: URL of the web page to summarize.

    Returns:
        A "... link already in the db, skip" message (str) when the page is
        already stored, otherwise the ``Document`` that was written to the db.
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    # Load the vector db and collect the ids it already holds; only the part of
    # each stored id before the first "_" is compared.
    db = Chroma(persist_directory="./arkdb", embedding_function=OpenAIEmbeddings())
    known_ids = {stored_id.split("_")[0] for stored_id in db.get()['ids']}

    # NOTE(review): the id keeps only 8 decimal digits of the hash, so two
    # different links can collide and the second one would be skipped.
    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))
    if this_id in known_ids:
        return link+" link already in the db, skip"

    # Load the page and split it into large chunks for summarization.
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 25000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    # More than two chunks: use the map-reduce chain; otherwise a single stuff chain.
    if len(docs) > 2:
        story = story_summary(docs)
    else:
        story = story_summary_stuff(docs)

    # The LLM-based metadata extraction (meta_summary) is disabled, which used
    # to leave `meta` undefined and raise NameError here after the summary had
    # already been computed. Record at least the source link as metadata.
    meta = {'source': link}

    # Write the summary to the vector db under the page id.
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    return output_doc


# Main function: summarize a list of web pages, then write each summary to the vector db
def run(link:str):
    """Summarize every web page listed in ``link`` and store each summary in the vector db.

    ``link`` is a single string holding one or more links. Links are extracted
    as maximal runs of word characters and ``/ . : # -``; every other character
    acts as a separator.
    NOTE(review): because ``? = & %`` are not in that set, a URL carrying a
    query string is split into several pieces - pass such pages to
    ``websummary`` directly.

    Links without an ``http://`` or ``https://`` scheme get ``https://``
    prepended. The result of ``websummary`` is printed for each link, followed
    by a 15 second pause.

    Args:
        link: text containing the links of the web pages to summarize.

    Returns:
        The string "summarize run successful".
    """
    import re, time

    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences (\w, \:, \#, \-), which Python reports as deprecated.
    for candidate in re.findall(r'[\w/.:#\-]+', link):
        # Test the scheme as a prefix. The previous substring test for "https:"
        # turned "http://host" into "https://http://host".
        if not candidate.lower().startswith(("http://", "https://")):
            candidate = "https://" + candidate
        print(websummary(candidate))
        # Pause between pages (presumably to stay under API rate limits).
        time.sleep(15)
    return "summarize run successful"
        
#link = "https://arknights.fandom.com/wiki/R8-1/Story"
#print(websummary(link))

# Summarize a web page described by scraped metadata (presumably Scrapy output)
def websummary_meta(meta:dict):
    """Summarize the page described by ``meta`` and store summary and page text in Chroma.

    The summary goes to the ``./arkdb`` store under the page id; the original
    page text, split into 5000-character chunks, goes to the ``./arkpage``
    store under ids of the form ``<page id>_<4-digit chunk index>``. Both use
    ``meta`` as document metadata.

    ``meta`` is modified in place (the caller persists these changes):
    ``characters`` is joined into a comma-separated string, a missing or
    ``None`` ``stage`` becomes ``""``, and ``indexed`` is set to ``True`` once
    the page has been summarized.

    Args:
        meta: metadata of the target page. Must contain ``source`` (the page
            URL) and ``characters`` (an iterable of strings, or an
            already-joined string).

    Returns:
        A "... link already in the db, skip" message (str) when the page id is
        already in ``./arkdb``, otherwise the summary ``Document``.
    """
    import hashlib
    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    link = meta['source']

    # Join the character list only once: joining an already-joined string
    # (e.g. when the same dict is processed again) would insert a comma
    # between every single character.
    characters = meta['characters']
    if not isinstance(characters, str):
        meta['characters'] = ','.join(characters)
    # Identity test for None; a missing key is treated like None instead of
    # raising KeyError.
    if meta.get('stage') is None:
        meta['stage'] = ""

    # Load the vector db for summaries and collect the ids it already holds;
    # only the part of each stored id before the first "_" is compared.
    db = Chroma(persist_directory="./arkdb", embedding_function=OpenAIEmbeddings())
    known_ids = {stored_id.split("_")[0] for stored_id in db.get()['ids']}

    # NOTE(review): the id keeps only 8 decimal digits of the SHA-1 hash, so two
    # different links can collide and the second one would be skipped.
    this_id = str(int(hashlib.sha1(link.encode("utf-8")).hexdigest(), 16) % (10 ** 8))
    if this_id in known_ids:
        return link+" link already in the db, skip"

    # Load the page and split it into large chunks for summarization.
    docs_org = web_loader_docs(link)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 25000, chunk_overlap = 500)
    docs = text_splitter.split_documents(docs_org)

    # More than two chunks: use the map-reduce chain; otherwise a single stuff chain.
    if len(docs) > 2:
        story = story_summary(docs)
    else:
        story = story_summary_stuff(docs)

    # Write the summary to the summary db.
    meta['indexed'] = True
    output_doc = Document(page_content=story, metadata=meta)
    db.add_documents([output_doc], ids = [this_id])

    # Store the original page text in a second db, in smaller chunks without overlap.
    # NOTE(review): if this step fails after the summary was written, a re-run
    # skips the page (its id is already in ./arkdb) and the text is never stored.
    page_db = Chroma(persist_directory="./arkpage", embedding_function=OpenAIEmbeddings())
    page_splitter = RecursiveCharacterTextSplitter(chunk_size = 5000, chunk_overlap = 0)
    page_chunks = page_splitter.split_documents(docs_org)
    chunk_ids = [f"{this_id}_{index:04d}" for index in range(len(page_chunks))]
    chunk_docs = [Document(page_content=chunk.page_content, metadata=meta) for chunk in page_chunks]
    page_db.add_documents(chunk_docs, ids = chunk_ids)

    return output_doc

# Main function: summarize the web pages listed in the scraper's JSON output file
def run_scapy(file = "quotes.json"):
    """Summarize every page listed in a JSON file of page metadata.

    The file must contain a JSON list of metadata dicts accepted by
    ``websummary_meta``. After each page the whole list, with that entry
    flagged ``indexed``, is written to ``out_<file name>`` next to the input
    file, so progress survives an interrupted run. A 15 second pause follows
    every page.

    Args:
        file: path of the JSON file to read.
    """
    import json, os, time

    with open(file) as f:
        scapy_list = json.load(f)

    # Prefix the file name, not the whole path: "dir/quotes.json" must map to
    # "dir/out_quotes.json" rather than "out_dir/quotes.json". For a bare file
    # name this yields the same "out_<file>" as before.
    folder, base_name = os.path.split(file)
    out_path = os.path.join(folder, "out_" + base_name)

    for entry in scapy_list:
        print(websummary_meta(entry))
        # Flagged even when the page was skipped as already stored.
        entry['indexed'] = True
        # Checkpoint: rewrite the full list after every page.
        with open(out_path, "w") as outfile:
            outfile.write(json.dumps(scapy_list))
        # Pause between pages (presumably to stay under API rate limits).
        time.sleep(15)


In [70]:
# Summarize every page listed in quotes.json and store the results in the vector dbs.
run_scapy(file="quotes.json")

https://arknights.fandom.com/wiki/Category:Stories link already in the db, skip
page_content='Unfortunately, the provided text does not contain a story summary.' metadata={'stage': '', 'episode': 'Lore', 'characters': '', 'source': 'https://arknights.fandom.com/wiki/Category:Stories?from=GA-2%2FStory', 'indexed': True}
page_content="The story begins with Indra and the Rhodes Island Operator preparing to attack Dublinn, a group outside of Londinium. They discuss the enemy's numbers and plan their strategy. Meanwhile, Dagda and Indra engage in combat with Dublinn soldiers inside an aircraft, rescuing a Londinium citizen who was being held captive. They discover that the citizen was a decoy and continue their mission to rescue the hostages. Back at Rhodes Island, Amiya and the Doctor assemble a squad and set off for a port in the Sudean Borough. They discuss their plan to infiltrate Londinium and the potential dangers they may face. They also meet with Mr. Thomas, a guide who agrees to he

In [89]:
# Ad-hoc check: summarize a single prts.wiki story page without writing to any vector db.
link = "https://prts.wiki/index.php?title=13-14_%E7%8C%A9%E7%BA%A2%E7%83%9F%E7%81%AB/END&action=edit"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=500)
docs = text_splitter.split_documents(docs_org)
# More than two chunks means the story is long: use the map-reduce chain, else the stuff chain.
story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)
story

"Summary 1:\nThe story begins with the protagonist, Margadar, in the town of Brentwood. She notices that her friend, Fleurida, is in distress and decides to help her. Margadar realizes that Fleurida needs someone to confide in and offers to be that person. However, their conversation is interrupted by a military officer who informs Margadar that he is leaving for battle. Margadar is concerned for Fleurida's safety and decides to warn the people at the construction site.\n\nSummary 2:\nAt the construction site, Margadar encounters the military officer again and questions why he hasn't left town. He reveals that he is there to protect something and believes that it is his duty to follow orders. Margadar realizes that he is referring to an upcoming ritual and becomes determined to stop it. However, the officer challenges her to get past him if she wants to interfere.\n\nSummary 3:\nMargadar manages to escape and warns the people at the construction site about the impending danger. They ar

In [2]:
# Ad-hoc check: summarize a single prts.wiki story page without writing to any vector db.
link = "https://prts.wiki/index.php?title=13-21_%E6%AE%B7%E7%BA%A2%E5%90%9B%E4%B8%BB/END&action=edit"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=500)
docs = text_splitter.split_documents(docs_org)
# More than two chunks means the story is long: use the map-reduce chain, else the stuff chain.
story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)
story

"Section 1: Introduction\nThe story begins with Logos and Amiya standing in a city drenched in blood rain. The Blood King, Sanguinarch, appears and mocks them for their failures. He taunts them to say their goodbyes, as this is their last chance.\n\nSection 2: Amiya's Rejection\nAmiya confronts Sanguinarch and rejects his actions and kingdom. She believes that the true power of blood lies in resistance and survival, not in violence and death. Sanguinarch dismisses her words, claiming he is the true master of blood.\n\nSection 3: Logos' Intervention\nLogos intervenes and uses his powers to stop the rain of blood and freeze Sanguinarch in place. He recites a mantra about the ceasing of wind and rain and the loss of power. However, Sanguinarch breaks free and mocks Logos for his futile attempts.\n\nSection 4: Amiya's Confrontation\nAmiya steps forward and confronts Sanguinarch, reiterating her rejection of his actions and kingdom. She believes that the true power of blood lies in resistan

In [6]:
# Ad-hoc check: summarize a single prts.wiki story page without writing to any vector db.
link = "https://prts.wiki/index.php?title=13-14_%E7%8C%A9%E7%BA%A2%E7%83%9F%E7%81%AB/END&action=edit"
docs_org = web_loader_docs(link)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=500)
docs = text_splitter.split_documents(docs_org)
# More than two chunks means the story is long: use the map-reduce chain, else the stuff chain.
story = story_summary(docs) if len(docs) > 2 else story_summary_stuff(docs)
story

'故事1：布伦特伍德镇的阴谋\n这个故事发生在布伦特伍德镇，玛格达尔决定帮助遇到麻烦的朋友芙蕾达。然而，她卷入了一场更大的阴谋中。玛格达尔遇到了一名王庭军尉官，他告诉她即将发生的仪式和镇上的危险。玛格达尔决定警告其他人，但被困在温室里。在温室里，她与王庭军尉官相遇，他试图阻止她离开。最终，玛格达尔成功逃脱，并与其他人一起撤离了布伦特伍德镇。然而，他们发现镇上的驻军已被萨卡兹占领，他们必须与敌人战斗才能生存下去。在战斗中，玛格达尔和她的朋友们展现出了勇敢和决心，他们决心保护自己和所爱的人。\n\n故事2：战车火力手号角与维多利亚人的抵抗\n号角带领战车部队进入战场，摧毁了血魔的法阵。随后，推进之王和其他战士们展开了激烈的战斗，夺回了失地。然而，血魔的力量越来越强大，他们开始入侵维多利亚的城市。战车部队继续前进，摧毁了城市中的源石结晶，并搜救了被困的市民。故事以地狱入侵家乡的场景结束。'