In [1]:
from bs4 import BeautifulSoup
from langchain.schema import Document

In [2]:
file_paths = ["/Users/rajithamuthukrishnan/Desktop/git/datasets/1/html/0009-01.html","/Users/rajithamuthukrishnan/Desktop/git/datasets/1/html/0015-01.html"]

In [3]:
len(file_paths)

2

# Extract Data and convert to LangDocs

In [4]:
def create_lang_documents(raw_docs):
    lang_docs = [Document(page_content=doc['body_text'], metadata={**doc['metadata']})
       for doc in raw_docs]
    return lang_docs

def extract_case_metadata(html_content: str) -> dict:
    soup = BeautifulSoup(html_content, "html.parser")
    
    body_text = soup.get_text(separator=" ", strip=True)
    # Extract - case id
    section = soup.find("section", {"class": "casebody"})
    case_id = section.get("data-case-id") if section else None
    # Extract - case title / name
    h4 = soup.find("h4", {"parties"})
    name = h4.get_text(strip=True) if h4 else None
    # Extract - attorneys
    attorneys = [
        tag.get_text(strip=True) for tag in soup.find_all("p",{"class", "attorneys"})
    ]
    # Extract - author
    author = soup.find("p",{"class","author"}).get_text(strip=True) if soup.find("p",{"class","author"}) else None
    
    return {
        "metadata":{
            "case_id": case_id,
            "case_name": name,
            "attorneys": attorneys,
            "author": author
        },
        "body_text": body_text
    }

def extract_data(file_list):
    data = []
    for file in file_list:
        if file.endswith('.html') or file.endswith('.htm'):
            with open(file, "r", encoding="utf-8") as f:
                html_content = f.read()
                data.append(extract_case_metadata(html_content)) 
                docs = create_lang_documents(data)
        else:
            print(f"Unsupported file type: {file}") 
    return docs

In [5]:
docs = extract_data(file_paths)

In [21]:
# docs

# Summarize Docs - Individual / Global

In [11]:
import copy
import operator
from typing import Annotated, List, TypedDict
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langgraph.types import Send
from langgraph.graph import END, START, StateGraph

In [17]:
async def summarize_docs(documents):
    new_docs = copy.deepcopy(documents)
    
    llm_model = OllamaLLM(
        model = "mistral"
    )
    
    map_template = "Write a concise summary based only on the context given below and not on your knowledge:\n{context}"
    reduce_template = """
The following is a set of summaries:
{summaries}
Take these and distill it into a final, consolidated summary of the main themes.
"""
    map_prompt = ChatPromptTemplate([("human"), map_template])
    reduce_prompt = ChatPromptTemplate([("human"), reduce_template])
    
    map_chain = map_prompt | llm_model | StrOutputParser()
    reduce_chain = reduce_prompt | llm_model | StrOutputParser()
    
    # overall state of the main graph - contains the input document contents,
    # corresponding summaries, and a final summary.
    class OverallState(TypedDict):
        contents: List[str]
        summaries: Annotated[list, operator.add]
        final_summary: str

    # state of the node that will "map" all documents in order to generate summaries
    class SummaryState(TypedDict):
        content: str
        
    async def generate_summary(state: SummaryState):
        response = await map_chain.ainvoke(state["content"])
        return {"summaries": [response]}

    def map_summaries(state: OverallState):
        return[
            Send("generate_summary", {"content": content}) for content in state["contents"]
        ]

    async def generate_final_summary(state: OverallState):
        response = await reduce_chain.ainvoke(state["summaries"])
        return {"final_summary": response}

    graph = StateGraph(OverallState)
    graph.add_node("generate_summary", generate_summary)
    graph.add_node("generate_final_summary", generate_final_summary)
    graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
    graph.add_edge("generate_summary", "generate_final_summary")
    graph.add_edge("generate_final_summary", END)
    app = graph.compile()
    
    results = await app.ainvoke({"contents": new_docs})
    for doc, summary in zip(new_docs, results['summaries']):
        doc.metadata['summary'] = summary
    new_docs.append(Document(metadata={'title':'final_summary'}, page_content=results['final_summary']))
    return new_docs

In [18]:
summarized_docs = await summarize_docs(docs)

In [20]:
# summarized_docs

# QA Tool

## Chunk docs

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    add_start_index=True,
)

In [8]:
chunks = text_splitter.split_documents(docs)

In [9]:
# chunks

## Embed the chunks

In [10]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

In [11]:
embedding = OllamaEmbeddings(
    model = "all-minilm:l6-v2",
)

faiss_vectorstore = FAISS.from_documents(chunks, embedding)

In [12]:
embedded_query=embedding.embed_query("Who is Bennet J?")

In [13]:
faiss_vectorstore.similarity_search_by_vector(
    embedded_query,
    k=2
)

[Document(id='0fb31cb7-324f-47ea-9950-e695b6f2ee18', metadata={'case_id': 'NOTALEPH000606_0001', 'case_name': 'The Peoplevs.Smith,et al.', 'attorneys': ['O. I). SempleandJohn B. Weller,for tbe applicants, and by', 'G. J. O.Keioen, (attorney general,) for tbe people.'], 'author': 'Bennett, J.', 'start_index': 2703}, page_content='comes up on tbe petition of the defendants to be discharged from, the custody of the sheriff of the district of Sonoma, under a writ of habeas corpus heretofore issued by this court. The return of the sheriff shows that the petitioners are detained by him by virtue of an order of the judge of First Instance of the distinct of Sonoma, and that such order was made upon the return of a warrant of arrest against the defendants, charging them with the commission of various felonious acts. Accompanying the return of the sheriff is also to be found a large amount of testimony taken on the examination, going to show that several Indians in the Nappa Valley were shot on

In [14]:
faiss_vectorstore.similarity_search(
    "Who is Bennet J",
    k=2
)

[Document(id='0fb31cb7-324f-47ea-9950-e695b6f2ee18', metadata={'case_id': 'NOTALEPH000606_0001', 'case_name': 'The Peoplevs.Smith,et al.', 'attorneys': ['O. I). SempleandJohn B. Weller,for tbe applicants, and by', 'G. J. O.Keioen, (attorney general,) for tbe people.'], 'author': 'Bennett, J.', 'start_index': 2703}, page_content='comes up on tbe petition of the defendants to be discharged from, the custody of the sheriff of the district of Sonoma, under a writ of habeas corpus heretofore issued by this court. The return of the sheriff shows that the petitioners are detained by him by virtue of an order of the judge of First Instance of the distinct of Sonoma, and that such order was made upon the return of a warrant of arrest against the defendants, charging them with the commission of various felonious acts. Accompanying the return of the sheriff is also to be found a large amount of testimony taken on the examination, going to show that several Indians in the Nappa Valley were shot on

## Q&A : with LLM

In [16]:
from langchain_core.prompts import ChatPromptTemplate
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
from langchain_ollama import OllamaLLM
# from langgraph.checkpoint.memory import MemorySaver
# from langgraph.graph import MessageState

In [20]:
# Goes in _init()
qa_prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    Question: {question}""")

llm_model = OllamaLLM(
        model = "mistral"
    )

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str
              
def retrieve(state: State):
    retrieved_docs = faiss_vectorstore.similarity_search(state['question'])
    return {'context': retrieved_docs}

def generate(state: State):
    retrieved_docs_content = '\n\n'.join(doc.page_content for doc in state['context'])
    messages = qa_prompt.invoke({"question": state['question'], "context": retrieved_docs_content})
    response = llm_model.invoke(messages)
    return {'answer': response}

# Compile application and test
qa_workflow_builder = StateGraph(State).add_sequence([retrieve, generate])
qa_workflow_builder.add_edge(START, "retrieve")
qa_workflow = qa_workflow_builder.compile()

In [21]:
def ask_llm(question):
    response = qa_workflow.invoke({"question": question})
    return response

In [22]:
qa_response = ask_llm("Who is Bennet J?")

In [24]:
qa_response['answer']

' Based on the provided context, it appears that Bennet J. is a judge, as he is mentioned as delivering the opinion in the case discussed in the text.'

In [25]:
qa_response['context']

[Document(id='0fb31cb7-324f-47ea-9950-e695b6f2ee18', metadata={'case_id': 'NOTALEPH000606_0001', 'case_name': 'The Peoplevs.Smith,et al.', 'attorneys': ['O. I). SempleandJohn B. Weller,for tbe applicants, and by', 'G. J. O.Keioen, (attorney general,) for tbe people.'], 'author': 'Bennett, J.', 'start_index': 2703}, page_content='comes up on tbe petition of the defendants to be discharged from, the custody of the sheriff of the district of Sonoma, under a writ of habeas corpus heretofore issued by this court. The return of the sheriff shows that the petitioners are detained by him by virtue of an order of the judge of First Instance of the distinct of Sonoma, and that such order was made upon the return of a warrant of arrest against the defendants, charging them with the commission of various felonious acts. Accompanying the return of the sheriff is also to be found a large amount of testimony taken on the examination, going to show that several Indians in the Nappa Valley were shot on