In [6]:
import os

# SECURITY: credentials must never be hardcoded in a notebook, because they end
# up in version control and in shared copies of the file. The key is read from
# the GOOGLE_API_KEY environment variable instead; it is an empty string when
# the variable is not set, in which case calls to the Google API will fail.
# NOTE(review): a literal key was previously committed on this line. Treat it
# as leaked and revoke/rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

In [7]:
import uuid
from datetime import datetime, timezone
from typing import Optional, List

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

In [8]:
class DataLoader:
    """Pick a document loader from the file extension and run it.

    Only ``pdf`` and ``txt`` are recognised; anything else is rejected.
    """

    def __init__(self, file_path: str):
        # Path of the file to load; kept as the plain string that was passed in.
        self.path = file_path

    def _check_file_type(self):
        """Return the lower-cased text that follows the last ``.`` in the path.

        When the path contains no ``.`` the whole path is returned lower-cased.
        """
        _, _, tail = self.path.rpartition(".")
        return tail.lower()

    def load(self):
        """Load the file and return the result of the chosen loader's ``load()``.

        Raises:
            ValueError: if the extension is neither ``pdf`` nor ``txt``.
        """
        kind = self._check_file_type()

        # Reject unknown extensions up front so no loader is constructed for them.
        if kind not in ("pdf", "txt"):
            raise ValueError(f"Unsupported file type: {kind}")

        if kind == "pdf":
            reader = PyPDFLoader(self.path)
        else:
            reader = TextLoader(self.path, encoding="utf-8")

        return reader.load()


In [9]:
import uuid
from datetime import datetime

class DocumentManager:
    """In-memory registry of documents and their summarization results.

    Records live in ``self.store``, a dict keyed by a generated document id.
    Nothing is persisted outside this object.
    """

    def __init__(self):
        # document_id (str) -> record dict created by create_document().
        self.store = {}

    def create_document(self, file_path: str):
        """Register a new document and return its generated id.

        Args:
            file_path: Path of the source file; stored as given.

        Returns:
            A fresh UUID4 string identifying the new record.
        """
        document_id = str(uuid.uuid4())
        self.store[document_id] = {
            "file_path": file_path,
            # Timezone-aware UTC timestamp. The previous expression
            # (`datetime..now(datetime.UTC)`) was a syntax error, and
            # `datetime.utcnow()` is deprecated and returns a naive value.
            "created_at": datetime.now(timezone.utc),
            "summary": None,
            "risk_flag": None,
            "chunk_summaries": [],
        }
        return document_id

    def save_summary(self, document_id, summary, risk_flag, chunk_summaries):
        """Attach summarization results to an existing record.

        Raises:
            KeyError: if ``document_id`` was never created.
        """
        record = self.store[document_id]
        record["summary"] = summary
        record["risk_flag"] = risk_flag
        record["chunk_summaries"] = chunk_summaries

    def get_document(self, document_id):
        """Return the record for ``document_id``, or ``None`` if it is unknown."""
        return self.store.get(document_id)


In [10]:
class TextSplitter:
    """Split a document's text into chunks for summarization or embedding."""

    def __init__(self, text: str):
        # Raw text to split. embed_split() also accepts a list of Document
        # objects stored here, for callers that pass loader output directly.
        self.text = text

    def summary_split(self, chunk_size: int = 2500, chunk_overlap: int = 150):
        """Split the text into plain-string chunks for the summarization pass.

        Args:
            chunk_size: Chunk size passed to the splitter.
            chunk_overlap: Overlap passed to the splitter.

        Returns:
            The result of ``RecursiveCharacterTextSplitter.split_text``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        return splitter.split_text(self.text)

    def embed_split(self, chunk_size: int = 1200, chunk_overlap: int = 150):
        """Split the text into smaller Document chunks intended for embedding.

        Args:
            chunk_size: Chunk size passed to the splitter.
            chunk_overlap: Overlap passed to the splitter.

        Returns:
            Chunked Document objects produced by the splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        # split_documents() expects Document objects; handing it a plain string
        # makes it iterate over characters and fail. Wrap raw text with
        # create_documents() so a Document list comes back either way.
        if isinstance(self.text, str):
            return splitter.create_documents([self.text])
        return splitter.split_documents(self.text)


In [11]:
from pydantic import BaseModel, Field
from typing import Optional

class ChunkLegalResponse(BaseModel):
    # Structured-output schema for the analysis of a single text chunk.
    # Documented with comments rather than a class docstring so the model's
    # generated schema is left exactly as it was.

    # Summary of the chunk (required).
    Summary: str = Field(description="Detailed summary of this section")

    # Risky clause found in the chunk; None when nothing was supplied.
    Flag: Optional[str] = Field(description="Risky clause if present", default=None)


class FinalLegalResponse(BaseModel):
    # Structured-output schema for the merged, whole-document result.
    # Documented with comments rather than a class docstring so the model's
    # generated schema is not altered by this documentation.

    # Whole-document summary; validation rejects anything under 300 characters.
    Summary: str = Field(
        min_length=300,
        description="Full multi-paragraph legal summary"
    )
    # Most significant risk, or None when no risk was reported. The explicit
    # default makes the field truly optional (matching ChunkLegalResponse.Flag);
    # without it the field is required and an omitted Flag fails validation.
    Flag: Optional[str] = Field(
        default=None,
        description="Most significant risky clause for bearer"
    )


In [12]:
class ChatModel:
    """Factory for the Google Generative AI chat and embedding clients."""

    def __init__(self, api_key: str):
        # API key forwarded to every client this factory builds.
        self.api = api_key

    def get_chat_model(
        self,
        model: str = "gemini-2.5-pro",
        temperature: float = 0.4,
        max_tokens: int = 2000,
    ):
        """Build the chat model used for summarization.

        Args:
            model: Model name passed to ``ChatGoogleGenerativeAI``.
            temperature: Sampling temperature passed to the client.
            max_tokens: Output token limit passed to the client.

        Returns:
            A configured ``ChatGoogleGenerativeAI`` instance.
        """
        return ChatGoogleGenerativeAI(
            model=model,
            google_api_key=self.api,
            temperature=temperature,
            max_tokens=max_tokens
        )

    def get_embed_model(
        self,
        model: str = "gemini-embedding-001",
        output_dimensionality: int = 512,
    ):
        """Build the embedding model.

        Args:
            model: Embedding model name passed to the client.
            output_dimensionality: Forwarded to the client constructor.
                NOTE(review): confirm the installed langchain_google_genai
                version accepts this as a constructor argument.

        Returns:
            A configured ``GoogleGenerativeAIEmbeddings`` instance.
        """
        return GoogleGenerativeAIEmbeddings(
            model=model,
            api_key=self.api,
            output_dimensionality=output_dimensionality
        )


In [13]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the per-chunk pass: a system message with the analysis rules and a
# human message whose {text} placeholder receives one chunk of the document.
# NOTE(review): the rules ask the model to say so explicitly when no risky
# clause exists, while ChunkLegalResponse.Flag is optional and
# LegalSummarizer.summarize() records any non-empty Flag as a risk. Confirm the
# model leaves Flag empty in the "no risk" case.
chunk_prompt = ChatPromptTemplate.from_messages([
    ("system", """
You are a legal document analysis assistant.

Analyze ONLY the provided text segment.

Tasks:
1. Produce a detailed legal summary of this segment.
2. Identify any clause that may be risky or harmful to the bearer.

Rules:
- Do not infer beyond the given text.
- If no risky clause exists, explicitly say so.
- Preserve exact legal language where relevant.
"""),
    ("human", "{text}")
])


In [14]:
# Prompt for the merge pass: the {text} placeholder receives the concatenated
# per-chunk "SUMMARY:/RISK:" entries built in LegalSummarizer.summarize(), and
# the model is asked for one document-level summary plus the single most
# significant risk. The output structure itself is imposed by the caller via
# with_structured_output(FinalLegalResponse).
merge_prompt = ChatPromptTemplate.from_messages([
    ("system", """
You are a senior legal analyst.

You are given multiple legal summaries and identified risks
from different sections of the same document.

Tasks:
1. Produce a cohesive, multi-paragraph legal summary of the ENTIRE document.
2. Identify the SINGLE most significant risky clause affecting the bearer.
3. If multiple risks exist, choose the most severe.
4. If no risks exist, state this clearly.

Return output strictly in the required JSON format.
"""),
    ("human", "{text}")
])


In [15]:
def cap_chunks(chunks: List[str], max_chunks: int = 9) -> List[str]:
    """
    Limit the number of chunks to at most max_chunks
    by evenly sampling across the document.

    The first and last chunks are always kept when max_chunks >= 2, and the
    remaining picks are spread evenly between them, so the end of the document
    is represented. The previous stride-then-slice approach returned only the
    leading chunks whenever len(chunks) was less than twice max_chunks.

    Args:
        chunks: Ordered chunks of the document.
        max_chunks: Maximum number of chunks to return.

    Returns:
        ``chunks`` itself when it already fits; otherwise a new list of
        exactly ``max_chunks`` chunks in document order (empty when
        ``max_chunks`` is zero or negative).
    """
    total = len(chunks)
    if total <= max_chunks:
        return chunks

    # Guard against a division by zero below for non-positive caps.
    if max_chunks <= 0:
        return []
    if max_chunks == 1:
        return [chunks[0]]

    # Map max_chunks evenly spaced positions onto [0, total - 1]. Because
    # total > max_chunks, consecutive indices always differ by at least one,
    # so no chunk is picked twice.
    last_index = total - 1
    gaps = max_chunks - 1
    return [chunks[(i * last_index) // gaps] for i in range(max_chunks)]


def truncate_text(text: str, max_chars: int = 8000) -> str:
    """
    Bound the size of text sent to the LLM.

    Returns text unchanged when it is at most max_chars characters long,
    otherwise only its leading max_chars characters.
    """
    fits = len(text) <= max_chars
    return text if fits else text[:max_chars]


In [17]:
class LegalSummarizer:
    """Two-pass summarizer: per-chunk analysis followed by a merge step.

    Each chunk is analysed with ``chunk_prompt``; the per-chunk results are
    then concatenated and condensed by ``merge_prompt`` into one summary and
    one risk flag for the whole document.
    """

    def __init__(
        self,
        model,
        text: str,
        document_id: str,
        max_chunks: int = 9,
        max_merge_chars: int = 8000,
    ):
        """
        Args:
            model: Chat model exposing ``with_structured_output``.
            text: Full document text to summarize.
            document_id: Identifier echoed back in the result.
            max_chunks: Upper bound on the number of chunks analysed.
            max_merge_chars: Upper bound on the size of the merge-step input.
        """
        self.base_model = model
        self.text = text
        self.document_id = document_id
        self.max_chunks = max_chunks
        self.max_merge_chars = max_merge_chars

    def summarize(self) -> dict:
        """Run both passes and return the results as a plain dict.

        Returns:
            A dict with the keys ``document_id``, ``final_summary``,
            ``risk_flag`` and ``chunk_summaries`` (one entry per analysed
            chunk, each with ``summary`` and, when a flag was returned,
            ``risk``).

        Raises:
            ValueError: if the text yields no chunks to analyse.
            RuntimeError: if the model returns no structured result for any
                chunk, or none for the merge step.
        """
        chunks = TextSplitter(self.text).summary_split()
        chunks = cap_chunks(chunks, max_chunks=self.max_chunks)
        if not chunks:
            # Avoid asking the model to summarize nothing.
            raise ValueError("No text to summarize")

        # The chain does not depend on the chunk, so build it once.
        chunk_chain = chunk_prompt | self.base_model.with_structured_output(
            ChunkLegalResponse
        )

        chunk_results = []
        for chunk in chunks:
            result = chunk_chain.invoke({"text": chunk})
            # A structured-output chain can yield None when the model gives
            # back nothing parseable; skip that chunk rather than crash later.
            if result is not None:
                chunk_results.append(result)

        if not chunk_results:
            raise RuntimeError("Model returned no structured output for any chunk")

        chunk_summaries = []
        merge_parts = []
        for r in chunk_results:
            entry = {"summary": r.Summary}
            if r.Flag:
                entry["risk"] = r.Flag
            chunk_summaries.append(entry)
            merge_parts.append(
                f"SUMMARY:\n{r.Summary}\nRISK:\n{r.Flag or 'None'}"
            )

        # Bound the merge input; anything past the limit is cut off.
        merged_input = truncate_text(
            "\n\n".join(merge_parts), max_chars=self.max_merge_chars
        )

        final_model = self.base_model.with_structured_output(FinalLegalResponse)
        final_result = (merge_prompt | final_model).invoke(
            {"text": merged_input}
        )
        if final_result is None:
            raise RuntimeError("Model returned no structured output for the merge step")

        return {
            "document_id": self.document_id,
            "final_summary": final_result.Summary,
            "risk_flag": final_result.Flag,
            "chunk_summaries": chunk_summaries
        }



In [18]:
# End-to-end run: register the file, load its text, summarize it, store the
# results and print them. The names defined here (notably `result`) are read
# by later cells.
doc_manager = DocumentManager()

# Path is relative to the notebook's working directory.
file_path = "sample-doc/sample_rent_doc.txt"

# Create document identity
document_id = doc_manager.create_document(file_path)

# Load the file and join the content of all returned documents into one string.
docs = DataLoader(file_path).load()
text = "\n".join(doc.page_content for doc in docs)

model = ChatModel(api_key=GOOGLE_API_KEY).get_chat_model()

# summarize() returns a dict with document_id, final_summary, risk_flag and
# chunk_summaries.
summarizer = LegalSummarizer(model, text, document_id)
result = summarizer.summarize()

# Save results
doc_manager.save_summary(
    document_id,
    result["final_summary"],
    result["risk_flag"],
    result["chunk_summaries"]
)

print("Document ID:", document_id)
print("\n===== FINAL SUMMARY =====\n")
print(result["final_summary"])
print("\n===== RISK FLAG =====\n")
print(result["risk_flag"])


  "created_at": datetime.utcnow(),


Document ID: f4b05252-0b6c-4c18-8d02-c7977322347f

===== FINAL SUMMARY =====

This document is a Deed of Lease, a formal agreement establishing a landlord-tenant relationship between a Lessor and a Lessee for a specified term of years. The agreement outlines the Lessee's primary obligation to pay a fixed monthly ground rent in advance by the 5th of each month. Failure to pay on time incurs an interest penalty, although payment of this interest does not cure the default, leaving the Lessee vulnerable to further action by the Lessor.

The Lessee assumes significant financial responsibilities beyond rent. They are required to bear, pay, and discharge all existing and future rates, taxes, assessments, and other impositions on the property, effectively indemnifying the Lessor against all such liabilities. In terms of property use, the Lessee is granted the right to demolish existing structures and construct new buildings, subject to authority approvals, and can use the premises for any lawf

In [24]:
# Show the per-chunk entries (each a dict with "summary" and, when present,
# "risk") produced by the summarization run above.
print(result["chunk_summaries"])

[{'summary': "The text defines a lease agreement as a contract between a landlord (lessor) and a tenant (lessee). It specifies that the agreement outlines the monthly rent, the duration of the lease, and details the obligations and responsibilities of both parties. The document is titled 'Deed of Lease (for a Term of Years) Rent Agreement'."}, {'summary': 'This section of a Deed of Lease establishes an agreement between a Lessor (A) and a Lessee (B) for the lease of land and premises described in a schedule. The lease term is for a specified number of years, commencing on the 1st day of a month in 2000. The Lessee is obligated to pay a monthly ground rent of a specified amount (Rs. ...), free of deductions and in advance, on or before the 5th day of each calendar month. The Lessee covenants to pay this rent on time. If the rent is not paid by the due date, the Lessee is required to pay interest at a specified percentage per annum from the due date until payment. The agreement explicitl