In [7]:
from bs4 import BeautifulSoup
from langchain.schema import Document

In [2]:
file_paths = ["/Users/rajithamuthukrishnan/Desktop/git/datasets/1/html/0009-01.html","/Users/rajithamuthukrishnan/Desktop/git/datasets/1/html/0015-01.html"]

In [4]:
len(file_paths)

2

In [5]:
# Extract Data and convert to LangDocs

In [8]:
def create_lang_documents(raw_docs):
    lang_docs = [Document(page_content=doc['body_text'], metadata={**doc['metadata']})
       for doc in raw_docs]
    return lang_docs

def extract_case_metadata(html_content: str) -> dict:
    soup = BeautifulSoup(html_content, "html.parser")
    
    body_text = soup.get_text(separator=" ", strip=True)
    # Extract - case id
    section = soup.find("section", {"class": "casebody"})
    case_id = section.get("data-case-id") if section else None
    # Extract - case title / name
    h4 = soup.find("h4", {"parties"})
    name = h4.get_text(strip=True) if h4 else None
    # Extract - attorneys
    attorneys = [
        tag.get_text(strip=True) for tag in soup.find_all("p",{"class", "attorneys"})
    ]
    # Extract - author
    author = soup.find("p",{"class","author"}).get_text(strip=True) if soup.find("p",{"class","author"}) else None
    
    return {
        "metadata":{
            "case_id": case_id,
            "case_name": name,
            "attorneys": attorneys,
            "author": author
        },
        "body_text": body_text
    }

def extract_data(file_list):
    data = []
    for file in file_list:
        if file.endswith('.html') or file.endswith('.htm'):
            with open(file, "r", encoding="utf-8") as f:
                html_content = f.read()
                data.append(extract_case_metadata(html_content)) 
                docs = create_lang_documents(data)
        else:
            print(f"Unsupported file type: {file}") 
    return docs

In [9]:
docs = extract_data(file_paths)

In [10]:
docs

[Document(metadata={'case_id': 'NOTALEPH000606_0001', 'case_name': 'The Peoplevs.Smith,et al.', 'attorneys': ['O. I). SempleandJohn B. Weller,for tbe applicants, and by', 'G. J. O.Keioen, (attorney general,) for tbe people.'], 'author': 'Bennett, J.'}, page_content="The People vs. Smith, et al. It is too late to raise an objection to an affidavit or warrant of arrest on a criminal charge, after the examination of the prisoner has been had, and it appears that there is probable cause to suppose that he is guilty of felony, and an order of commitment has been made by the committing magistrate. So held upon an application to discharge a prisoner on habeas corpus. If it appear on the examination of a person before a committing magistrate, that the prisoner is guilty of felony, although different from that specified in the warrant of arrest, it is the duty of the officer to commit the prisoner for trial, for the offence of which he appears to be guilty. If an order of commitment be sufficie

# Summarize Docs - Individual / Global

In [11]:
import copy
import operator
from typing import Annotated, List, TypedDict
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langgraph.types import Send
from langgraph.graph import END, START, StateGraph

In [17]:
async def summarize_docs(documents):
    new_docs = copy.deepcopy(documents)
    
    llm_model = OllamaLLM(
        model = "mistral"
    )
    
    map_template = "Write a concise summary based only on the context given below and not on your knowledge:\n{context}"
    reduce_template = """
The following is a set of summaries:
{summaries}
Take these and distill it into a final, consolidated summary of the main themes.
"""
    map_prompt = ChatPromptTemplate([("human"), map_template])
    reduce_prompt = ChatPromptTemplate([("human"), reduce_template])
    
    map_chain = map_prompt | llm_model | StrOutputParser()
    reduce_chain = reduce_prompt | llm_model | StrOutputParser()
    
    # overall state of the main graph - contains the input document contents,
    # corresponding summaries, and a final summary.
    class OverallState(TypedDict):
        contents: List[str]
        summaries: Annotated[list, operator.add]
        final_summary: str

    # state of the node that will "map" all documents in order to generate summaries
    class SummaryState(TypedDict):
        content: str
        
    async def generate_summary(state: SummaryState):
        response = await map_chain.ainvoke(state["content"])
        return {"summaries": [response]}

    def map_summaries(state: OverallState):
        return[
            Send("generate_summary", {"content": content}) for content in state["contents"]
        ]

    async def generate_final_summary(state: OverallState):
        response = await reduce_chain.ainvoke(state["summaries"])
        return {"final_summary": response}

    graph = StateGraph(OverallState)
    graph.add_node("generate_summary", generate_summary)
    graph.add_node("generate_final_summary", generate_final_summary)
    graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
    graph.add_edge("generate_summary", "generate_final_summary")
    graph.add_edge("generate_final_summary", END)
    app = graph.compile()
    
    results = await app.ainvoke({"contents": new_docs})
    for doc, summary in zip(new_docs, results['summaries']):
        doc.metadata['summary'] = summary
    new_docs.append(Document(metadata={'title':'final_summary'}, page_content=results['final_summary']))
    return new_docs

In [18]:
summarized_docs = await summarize_docs(docs)

In [19]:
print(summarized_docs)

[Document(metadata={'case_id': 'NOTALEPH000606_0001', 'case_name': 'The Peoplevs.Smith,et al.', 'attorneys': ['O. I). SempleandJohn B. Weller,for tbe applicants, and by', 'G. J. O.Keioen, (attorney general,) for tbe people.'], 'author': 'Bennett, J.', 'summary': ' The case "The People vs. Smith, et al." discusses the rules regarding arrest warrants, committing magistrates, and habeas corpus in California. It is established that an affidavit upon information, though not of high weight, can be used for a legal proceeding as long as it sets forth facts within the knowledge of the deponent. If a prisoner appears guilty of felony during examination before a committing magistrate, they must be committed for trial for the offence they appear guilty of, even if different from the one specified in the warrant of arrest.\n\nCourts can take judicial notice of their own government\'s jurisdiction and local divisions. Judges of First Instance have the authority to act as examining and committing ma