In [349]:
import json
import re
from pathlib import Path, PureWindowsPath

import google.generativeai as genai
import pdfplumber
import spacy
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer


In [350]:
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# extract_entities() calls it on the document text and reads the entities it finds.
nlp = spacy.load("en_core_web_sm")

In [351]:
def load_api_keys(path="key.txt"):
    """Read API keys from a text file, one key per line.

    Args:
        path: Location of the key file.

    Returns:
        list[str]: The non-blank lines in file order, each with surrounding
        whitespace removed. Empty if the file has no non-blank lines.
    """
    keys = []
    with open(path, "r") as key_file:
        for raw_line in key_file:
            cleaned = raw_line.strip()
            # Blank / whitespace-only lines are not keys.
            if cleaned:
                keys.append(cleaned)
    return keys

class GeminiKeyManager:
    """Holds a pool of Gemini API keys and switches to the next one when a call fails.

    Instance attributes:
        keys:  list of API key strings loaded from the key file.
        index: position in ``keys`` of the key currently in use.
        model: the ``genai.GenerativeModel`` created by ``configure_client``.
    """

    def __init__(self, path="key.txt"):
        """Load the keys from ``path`` and configure the client with the first one.

        Raises:
            ValueError: If the key file contains no keys.
        """
        self.keys = load_api_keys(path)
        if not self.keys:
            # Fail here with a clear message instead of an IndexError in configure_client().
            raise ValueError(f"No API keys found in {path!r}")
        self.index = 0
        self.configure_client()

    def configure_client(self):
        """Point ``genai`` at the active key and (re)create ``self.model``."""
        genai.configure(api_key=self.keys[self.index])
        self.model = genai.GenerativeModel("gemini-1.5-flash")

    def rotate_key(self):
        """Advance to the next key (wrapping around) and reconfigure the client."""
        self.index = (self.index + 1) % len(self.keys)
        self.configure_client()
        print(f"Switched to API key #{self.index+1}")

    def safe_generate(self, prompt):
        """Call ``generate_content(prompt)``, rotating keys on failure.

        Every key is tried at most once per call; with a single key the call is
        retried once with that same key, so there are always at least two attempts.

        Returns:
            Whatever ``self.model.generate_content`` returns.

        Raises:
            Exception: The error from the final attempt, if every attempt failed.
        """
        max_attempts = max(2, len(self.keys))
        for attempt in range(1, max_attempts + 1):
            try:
                return self.model.generate_content(prompt)
            except Exception as e:
                print(f"Error with key #{self.index+1}: {e}")
                if attempt == max_attempts:
                    # Nothing left to try: surface the last error to the caller.
                    raise
                self.rotate_key()

# Module-level key manager used by enhance_with_gemini(). Constructing it reads
# the default key file ("key.txt") and configures the Gemini client immediately.
gemini_keys = GeminiKeyManager()


In [352]:
def extract_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts.

    Returns:
        str: Each page's extracted text followed by a newline. Pages for which
        ``extract_text()`` returns nothing (``None`` or ``""``) are skipped.
    """
    page_chunks = []
    with pdfplumber.open(pdf_file) as document:
        for current_page in document.pages:
            page_text = current_page.extract_text()
            if not page_text:
                continue
            page_chunks.append(page_text + "\n")
    return "".join(page_chunks)

In [353]:
def summarize_document(text, sentence_count=7):
    """Produce an extractive summary of ``text`` with sumy's LSA summarizer.

    Args:
        text: Plain text to summarise.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        str: The selected sentences converted with ``str`` and joined by newlines.
    """
    english_tokenizer = Tokenizer("english")
    parsed = PlaintextParser.from_string(text, english_tokenizer)
    lsa = LsaSummarizer()
    chosen = lsa(parsed.document, sentence_count)
    return "\n".join(map(str, chosen))

In [354]:
def extract_dates(text):
    """Find dates and durations in ``text`` and return each with nearby context.

    Args:
        text: Document text to scan (matching is case-insensitive).

    Returns:
        list[str]: One ``"<b>context:</b> value"`` string per distinct match, where
        ``context`` is up to 50 characters on either side of the match with
        newlines turned into spaces. Results are grouped by pattern, in the
        order the patterns are listed. ``["Not specified"]`` if nothing matched.
    """
    date_patterns = [
        r"\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b",       # 12/05/2021, 12-05-21, 12.05.2021
        r"\b\d{4}[/.-]\d{1,2}[/.-]\d{1,2}\b",         # 2021-05-12, 2021.07.19

        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{2,4}\b",  # July 19, 2025
        r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*,?\s+\d{2,4}\b",  # 19 July 2025

        r"\b\d+\s+(?:days?|weeks?|months?|years?)\b",   # 7 months, 10 years
        # "(3) years": a leading \b can never match in front of "(" (both neighbours
        # are non-word characters), so anchor with a look-behind instead.
        r"(?<!\w)[\(\[\{<\-*]?\d+[\)\]\}>\-*]?\s+(?:days?|weeks?|months?|years?)\b",

        r"\b(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+(?:days?|weeks?|months?|years?)\b",
    ]

    contextual_dates = []
    for pattern in date_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            date_val = match.group()

            # Up to 50 characters of context on each side of the match.
            start = max(0, match.start() - 50)
            end = min(len(text), match.end() + 50)
            context_snippet = text[start:end].replace("\n", " ").strip()

            contextual_dates.append(f"<b>{context_snippet}:</b> {date_val}")

    # Several patterns can hit the same span; drop duplicates, keeping first-seen order.
    contextual_dates = list(dict.fromkeys(contextual_dates))

    return contextual_dates if contextual_dates else ["Not specified"]


In [355]:

def extract_entities(text):
    """Collect parties, dates, money amounts and obligation clauses from ``text``.

    Args:
        text: Full document text.

    Returns:
        dict with four keys:
            "Parties":     texts of spaCy entities labelled ORG or PERSON.
            "Dates":       result of ``extract_dates(text)``.
            "Money":       texts of spaCy entities labelled MONEY.
            "Obligations": clause text running from an obligation keyword
                           ("shall", "must", "agree to", "responsible for") to the
                           next full stop, with whitespace collapsed to single spaces.
    """
    doc = nlp(text)

    # Non-capturing group: with a capturing group, findall() would return only the
    # keyword itself. DOTALL lets a clause continue across hard line breaks.
    obligation_re = re.compile(
        r"\b(?:shall|must|agree to|responsible for)\b.*?\.",
        flags=re.IGNORECASE | re.DOTALL,
    )

    entities = {
        "Parties": [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON"]],
        "Dates": extract_dates(text),
        "Money": [ent.text for ent in doc.ents if ent.label_ == "MONEY"],
        "Obligations": [
            " ".join(match.group().split()) for match in obligation_re.finditer(text)
        ],
    }
    return entities


In [356]:
def analyze_risks(text):
    """Flag risk categories whose keywords appear in ``text``.

    Each category has a regular expression of alternative keywords. A category is
    reported when any keyword occurs at the start of a word (case-insensitive),
    so inflected forms such as "confidentiality" still match but unrelated words
    that merely contain a keyword ("avoid", "agenda", "revenue") do not.

    Args:
        text: Full document text.

    Returns:
        list[str]: Matching category names in the order they are defined, or
        ``["No obvious risks detected"]`` when none match.
    """
    risks = []
    risk_keywords = {
        "Late Payments / Financial Penalties": r"late fee|penalt(y|ies)|interest|default in payment|overdue|delayed payment",
        "Termination & Breach": r"termination|breach of contract|void|cancellation|material breach|contract violation",
        "Confidentiality & Disclosure": r"disclosure|confidential|nda|non-disclosure|trade secret",
        "Liability & Indemnification": r"liable|indemnif(y|ication)|responsibility|hold harmless|compensation liability",
        "Intellectual Property Risks": r"intellectual property|copyright|patent|trademark|IP rights|proprietary|design rights",
        "Dispute Resolution (Indian Context)": r"arbitration|jurisdiction|venue|dispute resolution|litigation|mediation|conciliation",
        "Damages & Remedies": r"damages|compensation|losses|remedies|consequential damages|punitive damages|specific performance",
        "Defamation / Reputation": r"defamation|reputation|libel|slander|character assassination",
        "Automatic Renewal": r"auto-?renewal|automatic renewal|renewal term|extension of contract",
        "Legal Fees & Costs": r"advocate fees|legal fees|court costs|litigation expenses",
        "Employment / Labour Risks": r"non-compete|non compete|non-solicit|non solicit|employee|employment|labour|industrial dispute",
        "Governing Law / Jurisdiction (India)": r"governing law|laws of India|Indian Penal Code|IPC|jurisdiction of.*India|court of.*India|Supreme Court|High Court|Arbitration and Conciliation Act",
    }
    for risk, pattern in risk_keywords.items():
        # Anchor the whole alternation at a word start; the group keeps the \b
        # applying to every alternative, not just the first.
        if re.search(rf"\b(?:{pattern})", text, flags=re.IGNORECASE):
            risks.append(risk)

    return risks if risks else ["No obvious risks detected"]


In [357]:
def enhance_with_gemini(summary, entities, risks):
    """Ask Gemini to polish the offline results and return its reply as text.

    The three arguments are interpolated into a fixed instruction prompt that is
    sent through the module-level ``gemini_keys`` manager.

    Args:
        summary: Offline summary text.
        entities: Extracted entities (formatted into the prompt as-is).
        risks: Detected risk categories (formatted into the prompt as-is).

    Returns:
        str: The reply text, stripped of surrounding whitespace and, when the
        reply opens with a Markdown code fence, of that fence and a trailing
        closing fence. The text is requested to be JSON but is not validated here.
    """
    prompt = f"""
    You are a legal assistant.

    Here are extracted details from a legal document:

    --- Summary ---
    {summary}

    --- Entities ---
    {entities}

    --- Risks ---
    {risks}

    Task:
    1. Fix grammar and improve readability of the summary and other entities.
    2. Add a "suggestion" field (1–3 sentences of advice).
    3. Return a single JSON with keys:
       - summary
       - parties
       - date/time(
           mention context of a specific date/time with format "<b>context:</b> Value"
           example: 
                <b>Age of the cat:</b> 7 months
                <b>Date of the incident:</b> 2022-01-01
                <b>Document valid until:</b> 2023-01-01
            if any number is not a valid date with context unrelated to date/time, do not include it
            with each date/time on a new line, separated by <br/> inside the JSON string
            )
       - money/penalties
       - obligations
       - risks (string with multiple lines. Each risk must be formatted as:  
            "<b>• Risk Name</b>: Risk description(1line) asper Indian law" 
            with each risk on a new line, separated by <br/> inside the JSON string)
       - suggestion
    ONLY return valid JSON. Do not include triple backticks or any extra text.
    the entities (parties, date/time, money/penalties, obligations) as well as risks should return plain text, not inside [].
    if data is not available for any of the fields,return few words that might be relevant insted of none.
    DO NOT return any confidential information like party phone numbers, addresses(email can be returned as it necessary for further communication).
    """
    reply_text = gemini_keys.safe_generate(prompt).text.strip()

    # Nothing to unwrap unless the reply opens with a Markdown code fence.
    if not reply_text.startswith("```"):
        return reply_text

    # Drop the opening fence (optionally tagged "json"), then a closing fence at the very end.
    opening_fence = re.compile(r"^```(?:json)?", flags=re.IGNORECASE)
    unfenced = opening_fence.sub("", reply_text.strip()).strip()
    return re.sub(r"```$", "", unfenced).strip()


In [358]:
def save_json_to_pdf(json_str, filename="legal_analysis.pdf"):
    """Render the analysis JSON as a simple PDF report.

    Each top-level key becomes a heading followed by its value as a paragraph.
    If ``json_str`` is not valid JSON, or is valid JSON that is not an object,
    the raw text is written under a single "raw_output" section instead.

    Args:
        json_str: Text expected to hold a JSON object.
        filename: Path of the PDF to write.
    """
    try:
        data = json.loads(json_str)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        print("Gemini response not valid JSON. Saving raw text.")
        data = {"raw_output": json_str}
    else:
        if not isinstance(data, dict):
            # A JSON list or scalar has no .items(); fall back to the raw text.
            print("Gemini response is not a JSON object. Saving raw text.")
            data = {"raw_output": json_str}

    doc = SimpleDocTemplate(filename)
    styles = getSampleStyleSheet()
    flow = [Paragraph("AI Legal Document Analysis", styles["Title"]), Spacer(1, 20)]

    for key, value in data.items():
        flow.append(Paragraph(f"<b>{key.title()}:</b>", styles["Heading3"]))
        flow.append(Spacer(1, 6))
        flow.append(Paragraph(str(value), styles["Normal"]))
        flow.append(Spacer(1, 12))

    doc.build(flow)
    print(f"Saved: {filename}")


In [359]:
def process_documents(pdf_files, output_dir="output_docs"):
    """Run the full analysis pipeline on each PDF and write one report per input.

    For every input the text is extracted, summarised, scanned for entities and
    risks, sent to Gemini for polishing, and saved as
    ``<output_dir>/<input file name without extension>_analysis.pdf``.

    Args:
        pdf_files: Iterable of input PDF paths. Both ``/`` and ``\\`` are
            accepted as directory separators.
        output_dir: Directory for the generated reports; created if missing.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file}\n{'-'*50}")
        text = extract_text(pdf_file)

        summary = summarize_document(text)
        entities = extract_entities(text)
        risks = analyze_risks(text)

        print("Offline Summary/Entities/Risks ready.")

        json_result = enhance_with_gemini(summary, entities, risks)
        print("Gemini JSON Result:\n", json_result[:400], "...\n")

        # Derive the report name from the input's file name instead of slicing at
        # fixed offsets. PureWindowsPath splits on both "/" and "\" on any platform.
        stem = PureWindowsPath(pdf_file).stem
        save_json_to_pdf(json_result, filename=str(out_dir / f"{stem}_analysis.pdf"))


In [360]:
# Input documents to analyse. Note this is a raw string, so the doubled backslash
# is kept literally: the path contains two backslash characters after "input_docs".
pdf_files = [r"input_docs\\sample_legal_document.pdf"] 
# Run the whole pipeline; one analysis PDF is written per input file.
process_documents(pdf_files)


Processing: input_docs\\sample_legal_document.pdf
--------------------------------------------------
Offline Summary/Entities/Risks ready.
Gemini JSON Result:
 {
  "summary": "This Non-Disclosure Agreement (NDA) is between Party A and Party B.  Its purpose is to protect confidential information, including trade secrets, financial data, and technical processes, from disclosure to third parties without prior written consent.  Confidentiality obligations last for three years from the date of disclosure.  Breach of the agreement may result in legal remedies, ...

Saved: output_docs\sample_legal_document_analysis.pdf
