In [1]:
import os
import warnings

# Read the Google Generative AI key from the environment instead of embedding
# it in the notebook.
# SECURITY: a literal key used to be stored on this line. Treat that key as
# compromised and revoke/rotate it, since it remains in any shared copy or
# version-control history of this notebook.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    # Warn instead of raising so the definition cells below can still be run;
    # cells that talk to the Google API are expected to fail until it is set.
    warnings.warn(
        "GOOGLE_API_KEY is not set; export it before running the model cells."
    )

In [2]:
from pathlib import Path
from typing import Optional, List

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

In [3]:
class DataLoader:
    """Load a ``.pdf`` or ``.txt`` file through the matching LangChain loader.

    Args:
        file_path: Location of the file to load. A string or any object
            accepted by ``pathlib.Path`` works for extension detection; the
            value is handed to the loader unchanged.
    """

    def __init__(self, file_path: str):
        self.path = file_path

    def _check_file_type(self) -> str:
        """Return the lower-cased extension of the file name, without the dot.

        Only the final path component is inspected, so a dot inside a
        directory name is never mistaken for an extension.

        Returns:
            The text after the last dot of the file name, lower-cased, or
            ``""`` when the file name contains no dot.
        """
        _, dot, ext = Path(self.path).name.rpartition(".")
        return ext.lower() if dot else ""

    def load(self):
        """Load the file and return whatever the selected loader's ``load()`` returns.

        Raises:
            ValueError: If the extension is neither ``pdf`` nor ``txt``.
        """
        ext = self._check_file_type()

        if ext == "pdf":
            loader = PyPDFLoader(self.path)
        elif ext == "txt":
            # Text files are always decoded as UTF-8.
            loader = TextLoader(self.path, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {ext or '(no extension)'}")

        return loader.load()


In [4]:
class TextSplitter:
    """Split a document into chunks for summarization or for embedding.

    Args:
        text: The content to split. ``summary_split`` expects a string;
            ``embed_split`` accepts a string or an iterable of document
            objects.
    """

    def __init__(self, text: str):
        self.text = text

    def summary_split(self):
        """Split the stored text with ``chunk_size=2500`` and ``chunk_overlap=150``.

        Returns:
            The result of ``RecursiveCharacterTextSplitter.split_text``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=2500,
            chunk_overlap=150
        )
        return splitter.split_text(self.text)

    def embed_split(self):
        """Split the stored content with ``chunk_size=1200`` and ``chunk_overlap=150``.

        Returns:
            Document objects produced by the splitter: via
            ``create_documents`` when the stored content is a string, via
            ``split_documents`` otherwise.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=150
        )
        if isinstance(self.text, str):
            # split_documents() iterates over document objects; handing it a
            # plain string would iterate over single characters and fail.
            return splitter.create_documents([self.text])
        return splitter.split_documents(self.text)


In [5]:
from pydantic import BaseModel, Field
from typing import Optional

class ChunkLegalResponse(BaseModel):
    # Structured result for a single text chunk.
    # Plain comments are used instead of a class docstring on purpose:
    # pydantic would surface a docstring as the schema description, which
    # would change the schema generated from this model.

    # Required summary text for the chunk.
    Summary: str = Field(description="Detailed summary of this section")

    # Optional risky-clause text; None when the field is omitted.
    Flag: Optional[str] = Field(description="Risky clause if present", default=None)


class FinalLegalResponse(BaseModel):
    # Structured result for the merged, whole-document pass.
    # Plain comments are used instead of a class docstring so the schema
    # generated from this model carries no extra description.

    # Required summary; values shorter than 300 characters fail validation.
    Summary: str = Field(
        min_length=300,
        description="Full multi-paragraph legal summary"
    )
    # Optional risk text. default=None makes the field omittable, matching
    # ChunkLegalResponse.Flag; without a default an Optional field is still
    # required and validation fails when the model leaves it out.
    Flag: Optional[str] = Field(
        default=None,
        description="Most significant risky clause for bearer"
    )


In [6]:
class ChatModel:
    """Factory for the Google Generative AI chat and embedding clients.

    Args:
        api_key: API key passed to every client this object creates.
    """

    def __init__(self, api_key: str):
        self.api = api_key

    def get_chat_model(
        self,
        model: str = "gemini-2.5-pro",
        temperature: float = 0.4,
        max_tokens: int = 2000,
    ):
        """Create a ``ChatGoogleGenerativeAI`` client.

        Args:
            model: Model name forwarded to the client.
            temperature: Sampling temperature forwarded to the client.
            max_tokens: Output-token limit forwarded to the client.

        Returns:
            The constructed ``ChatGoogleGenerativeAI`` instance.
        """
        return ChatGoogleGenerativeAI(
            model=model,
            google_api_key=self.api,
            temperature=temperature,
            max_tokens=max_tokens
        )

    def get_embed_model(
        self,
        model: str = "gemini-embedding-001",
        output_dimensionality: int = 512,
    ):
        """Create a ``GoogleGenerativeAIEmbeddings`` client.

        ``GoogleGenerativeAIEmbeddings`` lives in ``langchain_google_genai``
        and has to be imported next to ``ChatGoogleGenerativeAI``; without
        that import this method raises ``NameError``.

        Args:
            model: Embedding model name forwarded to the client.
            output_dimensionality: Embedding size forwarded to the client.

        Returns:
            The constructed ``GoogleGenerativeAIEmbeddings`` instance.
        """
        return GoogleGenerativeAIEmbeddings(
            model=model,
            api_key=self.api,
            output_dimensionality=output_dimensionality
        )


In [7]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the per-chunk pass. The text is kept verbatim
# because it is sent to the model as-is.
_CHUNK_SYSTEM_MESSAGE = """
You are a legal document analysis assistant.

Analyze ONLY the provided text segment.

Tasks:
1. Produce a detailed legal summary of this segment.
2. Identify any clause that may be risky or harmful to the bearer.

Rules:
- Do not infer beyond the given text.
- If no risky clause exists, explicitly say so.
- Preserve exact legal language where relevant.
"""

# Two-message prompt: fixed system instructions, then the chunk itself,
# supplied through the "text" template variable.
chunk_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _CHUNK_SYSTEM_MESSAGE),
        ("human", "{text}"),
    ]
)


In [8]:
# System instructions for the merge pass. The text is kept verbatim because
# it is sent to the model as-is.
_MERGE_SYSTEM_MESSAGE = """
You are a senior legal analyst.

You are given multiple legal summaries and identified risks
from different sections of the same document.

Tasks:
1. Produce a cohesive, multi-paragraph legal summary of the ENTIRE document.
2. Identify the SINGLE most significant risky clause affecting the bearer.
3. If multiple risks exist, choose the most severe.
4. If no risks exist, state this clearly.

Return output strictly in the required JSON format.
"""

# Two-message prompt: fixed system instructions, then the combined chunk
# results, supplied through the "text" template variable.
merge_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _MERGE_SYSTEM_MESSAGE),
        ("human", "{text}"),
    ]
)


In [9]:
def cap_chunks(chunks: List[str], max_chunks: int = 9) -> List[str]:
    """Limit ``chunks`` to at most ``max_chunks`` items, sampled evenly.

    When sampling is needed, the first and the last chunk are always kept and
    the remaining picks are spread evenly between them, so the end of the
    document is represented as well as the beginning.

    Args:
        chunks: Chunks in document order.
        max_chunks: Maximum number of chunks to return.

    Returns:
        ``chunks`` itself when it already fits, otherwise a new list of
        exactly ``max_chunks`` chunks in document order.

    Raises:
        ValueError: If sampling is required and ``max_chunks`` is below 1.
    """
    total = len(chunks)
    if total <= max_chunks:
        return chunks
    if max_chunks < 1:
        raise ValueError("max_chunks must be at least 1")
    if max_chunks == 1:
        return [chunks[0]]

    # Map pick i in [0, max_chunks - 1] onto index range [0, total - 1].
    # Because total > max_chunks, consecutive picks are always distinct.
    last_index = total - 1
    gaps = max_chunks - 1
    return [chunks[i * last_index // gaps] for i in range(max_chunks)]


def truncate_text(text: str, max_chars: int = 8000) -> str:
    """Bound the size of ``text``.

    Args:
        text: Input string.
        max_chars: Length limit in characters.

    Returns:
        ``text`` unchanged when its length does not exceed ``max_chars``,
        otherwise the slice ``text[:max_chars]``.
    """
    fits = len(text) <= max_chars
    return text if fits else text[:max_chars]


In [10]:
class LegalSummarizer:
    """Two-pass summarizer: analyze each chunk, then merge the chunk results.

    Args:
        model: Chat model object exposing ``with_structured_output``.
        text: Full document text to summarize.
        max_chunks: Upper bound on the number of chunks sent to the model.
        max_merge_chars: Upper bound on the size of the merged chunk results
            passed to the final pass.
    """

    def __init__(self, model, text: str, max_chunks: int = 9, max_merge_chars: int = 8000):
        self.base_model = model
        self.text = text
        self.max_chunks = max_chunks
        self.max_merge_chars = max_merge_chars

    def summarize(self) -> FinalLegalResponse:
        """Run the chunk pass and the merge pass and return the final result.

        Returns:
            The structured output of the merge pass.

        Raises:
            ValueError: If the stored text is empty or whitespace only.
            RuntimeError: If the model yields no structured output for any
                chunk, or none for the merge pass.
        """
        if not self.text or not self.text.strip():
            # An empty prompt would still be sent to the model otherwise.
            raise ValueError("Cannot summarize empty text")

        chunks = TextSplitter(self.text).summary_split()
        chunks = cap_chunks(chunks, max_chunks=self.max_chunks)

        # The chain does not depend on the chunk, so build it once.
        chunk_chain = chunk_prompt | self.base_model.with_structured_output(
            ChunkLegalResponse
        )

        chunk_results = []
        for chunk in chunks:
            result = chunk_chain.invoke({"text": chunk})
            # NOTE(review): structured-output chains may yield None when the
            # model returns nothing parsable - confirm for this provider.
            if result is not None:
                chunk_results.append(result)

        if not chunk_results:
            raise RuntimeError("Model returned no structured output for any chunk")

        merged_input = "\n\n".join(
            f"SUMMARY:\n{r.Summary}\nRISK:\n{r.Flag or 'None'}"
            for r in chunk_results
        )
        # Plain prefix cut: anything past the limit is dropped.
        merged_input = truncate_text(merged_input, max_chars=self.max_merge_chars)

        final_chain = merge_prompt | self.base_model.with_structured_output(
            FinalLegalResponse
        )
        final_result = final_chain.invoke({"text": merged_input})
        if final_result is None:
            raise RuntimeError("Model returned no structured output for the final summary")

        return final_result


In [12]:

# Sample document to summarize, relative to the notebook's working directory.
SAMPLE_DOC_PATH = "sample-doc/sample_rent_doc.txt"

# Load the file and join the content of every loaded document into one string.
docs = DataLoader(SAMPLE_DOC_PATH).load()
text = "\n".join(doc.page_content for doc in docs)

model = ChatModel(api_key=GOOGLE_API_KEY).get_chat_model()

summarizer = LegalSummarizer(model, text)
result = summarizer.summarize()

print("\n===== FINAL SUMMARY =====\n")
print(result.Summary)

print("\n===== RISK FLAG =====\n")
print(result.Flag)



===== FINAL SUMMARY =====

This document outlines a Deed of Lease, a contractual agreement between a Lessor (property owner) and a Lessee (tenant) for a specified property. The lease is established for a multi-year term commencing in 2000, in exchange for a fixed monthly ground rent. The payment terms are strict, requiring the Lessee to pay the rent in advance by the 5th of each month, free of all deductions. Failure to pay on time incurs an interest penalty, though payment of this interest does not formally excuse the default, leaving the Lessee vulnerable to further remedies by the Lessor.

The agreement places a significant number of obligations on the Lessee. Beyond rent, the Lessee is responsible for bearing all existing and future rates, taxes, and assessments on the property, and must indemnify the Lessor against these costs. While the Lessee is required to keep the property in good repair, they are granted the right to demolish existing structures and erect new ones, provided 

In [13]:
import uuid
# Empty module-level dict; no cell visible in this notebook reads or writes it.
memory_store = dict()

In [None]:


class ThreadManager:
    """In-memory store of chat messages grouped by thread id."""

    def __init__(self):
        # Maps thread id (str) -> list of {"role": ..., "content": ...} dicts.
        self.store = {}

    def create_thread(self):
        """Create an empty thread and return its id (a random UUID4 string)."""
        thread_id = str(uuid.uuid4())
        self.store[thread_id] = []
        return thread_id

    def add_message(self, thread_id, role, content):
        """Append a message to an existing thread.

        Args:
            thread_id: Id previously returned by ``create_thread``.
            role: Stored under the ``"role"`` key.
            content: Stored under the ``"content"`` key.

        Raises:
            KeyError: If ``thread_id`` is not a known thread.
        """
        if thread_id not in self.store:
            raise KeyError(f"Unknown thread_id: {thread_id!r}")
        self.store[thread_id].append({
            "role": role,
            "content": content
        })

    def get_history(self, thread_id):
        """Return the messages of a thread in insertion order.

        A new list is returned, so appending to or clearing the result does
        not change the stored history. The message dicts inside it are the
        stored objects, not copies. Unknown ids yield an empty list.
        """
        return list(self.store.get(thread_id, []))


In [21]:
# Scratch call: generates one random UUID and displays it as the cell output;
# the value is not assigned to any name.
uuid.uuid4()

UUID('ce724b07-b7cd-4813-826a-86945b9db4ba')