# **Developing a Strategic Legal Preparation Tool :**

# *Approach of Project* :

1. #### *Collect Data*: Obtain PDF of the legal document.
2. #### *Structure Document*: Split into {page, line, text}.
3. #### *Keyword Extraction*: Use TextRank / KeyBERT / TF-IDF to get important keywords.
4. #### *Sentence Scoring*: Rank sentences based on keyword presence and importance.
5. #### *Argument Classification*: Tag sentences as For or Against.
6. #### *Reference Mapping*: Attach page and line numbers to each key sentence.
7. #### *Top 10 Selection*: Pick the most pivotal items, ensuring balanced perspective.
8. #### *Optional Summarization*: Abstractive summary for readability (keep original sentences for reference).
9. #### *Output*: Present in table/JSON with keyword, sentence, page, line, and stance.

In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Obtaining dependency information for pdfplumber from https://files.pythonhosted.org/packages/db/e0/52b67d4f00e09e497aec4f71bc44d395605e8ebcea52543242ed34c25ef9/pdfplumber-0.11.7-py3-none-any.whl.metadata
  Using cached pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Obtaining dependency information for pdfminer.six==20250506 from https://files.pythonhosted.org/packages/73/16/7a432c0101fa87457e75cb12c879e1749c5870a786525e2e0f42871d6462/pdfminer_six-20250506-py3-none-any.whl.metadata
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Obtaining dependency information for pypdfium2>=4.18.0 from https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_6

In [None]:
!pip install keybert sentence-transformers


In [None]:
!pip install sentence-transformers

## *Import libraries  :*

In [None]:
import pdfplumber 
import re
import pandas as pd

from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download("punkt")

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import pipeline

from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import BartForConditionalGeneration, BartTokenizer

import json                
from tabulate import tabulate 

## Extract text from case :

In [None]:
# Location of the amicus-brief PDF to analyse.
# NOTE(review): this is a Jupyter file-server URL rather than a filesystem
# path, and it is passed unchanged to pdfplumber.open() below — confirm that
# call accepts a URL, or replace this with the PDF's local path.
file_path="http://localhost:8888/files/Assignment/Moto-AI/Alliance-Hippocratic-Medicine_2023.02.13_AMICUS-BRIEF-State-of-Mississippi-et-al.pdf"

In [None]:
# One record per extracted text line: {"Page", "Line", "Text"}.
# Page and line numbers are both 1-based.
all_data = []

with pdfplumber.open(file_path) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        page_text = page.extract_text()
        if not page_text:
            # Nothing extractable on this page; it contributes no rows.
            continue
        all_data.extend(
            {
                "Page": page_num,   # store page number
                "Line": line_num,   # line number within the page
                "Text": line_text,  # store single line
            }
            for line_num, line_text in enumerate(page_text.split('\n'), start=1)
        )


## Convert data into Dataframe :

In [None]:
# Name the columns explicitly so the frame always has Page/Line/Text, even
# when no text could be extracted and all_data is empty; otherwise the later
# df["Text"] lookup would raise KeyError.
df = pd.DataFrame(all_data, columns=["Page", "Line", "Text"])

In [None]:
df.head(10)

## Keywords extraction :

In [None]:
full_text=" ".join(df["Text"].tolist())

In [None]:
kw_model=KeyBERT()
def extract_keywords(sent):
    """Return up to five key phrases (one to three words each) for ``sent``.

    The phrases come from the module-level ``kw_model``; only the first
    element of each result it returns is kept.
    """
    results = kw_model.extract_keywords(
        sent,
        keyphrase_ngram_range=(1, 3),
        top_n=5,
    )
    phrases = []
    for result in results:
        phrases.append(result[0])
    return phrases

In [None]:
keywords=extract_keywords(full_text)
print("Keywords extracted : \n",keywords)

## Sentence Tokenization : 

In [None]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as a by-product of splitting
    on whitespace and re-joining the pieces.
    """
    tokens = text.split()
    return " ".join(tokens)

# Tokenize the whole document into sentences, then map each sentence back to
# the first extracted line that overlaps it so it can be cited by page/line.
sentences = sent_tokenize(full_text)

# Normalise every line once up front instead of once per sentence.  Blank
# lines are dropped: the empty string is a substring of every sentence, so a
# blank line would otherwise be reported as the source of any sentence that
# reaches it.
indexed_lines = [
    (page_no, line_no, cleaned_line)
    for page_no, line_no, cleaned_line in zip(
        df["Page"], df["Line"], (clean_text(raw_line) for raw_line in df["Text"])
    )
    if cleaned_line
]

sentence_data = []
for sent in sentences:
    sent_clean = clean_text(sent)
    # First line (in document order) that contains the sentence or is
    # contained in it; (None, None) when no line overlaps.
    match_page, match_line = next(
        (
            (page_no, line_no)
            for page_no, line_no, cleaned_line in indexed_lines
            if sent_clean in cleaned_line or cleaned_line in sent_clean
        ),
        (None, None),
    )
    sentence_data.append({
        "Sentence ": sent_clean,
        "Page ": match_page,
        "Line ": match_line,
    })


## Convert sentence_data into Dataframe : 

In [None]:
sent_df=pd.DataFrame(sentence_data)

## Sentence scoring based on Keywords :

In [None]:
# `keywords` is already a list of plain strings (extract_keywords keeps only
# the phrase from each result), so lowercase each whole phrase.  Indexing with
# kw[0] here would keep just the first character of every keyword.
keyword_list = [kw.lower() for kw in keywords]

In [None]:
def score_sentences(sent, keys=None):
    """Count how many keywords occur in ``sent``, ignoring letter case.

    Args:
        sent: Sentence text to score.
        keys: Optional iterable of keyword strings.  When omitted, the
            module-level ``keyword_list`` is used.

    Returns:
        The number of keywords that appear as a substring of the sentence.
    """
    if keys is None:
        keys = keyword_list
    # keyword_list holds lowercased keywords, so the sentence has to be
    # lowercased as well or capitalised occurrences are never counted.
    haystack = sent.lower()
    return sum(1 for key in keys if key.lower() in haystack)

In [None]:
sent_df["Score"]=sent_df["Sentence "].apply(score_sentences)

In [None]:
# Sort by score in descending order (highest-scoring sentences first).
# mergesort is stable, so sentences with equal scores keep the order they had
# before sorting instead of an arbitrary one.
sent_df = sent_df.sort_values(by="Score", ascending=False, kind="mergesort")

## Argument Classification : 

In [None]:
import re
import string

# Cue words/phrases that mark a sentence as "For".
_FOR_CUES = (
    "violate", "defy", "unlawful", "undermine", "harm", "oppose", "restrict",
    "protect", "safeguard", "defiance", "illegal", "contravene", "obstruction",
    "prohibited", "threaten", "contrary", "encroachment", "evade", "disregard",
    "jeopardize", "risk", "impose", "conflict", "unlawfulness", "infringement",
    "outweigh", "undermine public interest",
)

# Cue words/phrases that mark a sentence as "Against".
_AGAINST_CUES = (
    "access", "available", "expand", "promote", "ensure", "safely", "permitted",
    "authorized", "approved", "provide", "therapeutic benefit", "enforce discretion",
    "facilitate",
)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _compile_cues(cues):
    """Build one regex that matches any cue at the start of a word."""
    # The leading \b stops a cue from matching inside another word ("harm" in
    # "pharmacy", "approved" in "unapproved"); leaving the end open still
    # accepts inflected forms such as "violates" or "restricting".
    return re.compile(r"\b(?:" + "|".join(re.escape(cue) for cue in cues) + r")")


_FOR_PATTERN = _compile_cues(_FOR_CUES)
_AGAINST_PATTERN = _compile_cues(_AGAINST_CUES)


def argument_classification(sent):
    """Tag a sentence as "For", "Against" or "Neutral" from cue words.

    The sentence is lowercased and stripped of punctuation, then checked
    against the "For" cues first and the "Against" cues second, so a sentence
    containing both kinds of cue is tagged "For".

    Args:
        sent: Sentence text to classify.

    Returns:
        "For", "Against", or "Neutral" when no cue is found.
    """
    # Lowercase and remove punctuation
    sent_clean = sent.lower().translate(_PUNCTUATION_TABLE)

    if _FOR_PATTERN.search(sent_clean):
        return "For"
    if _AGAINST_PATTERN.search(sent_clean):
        return "Against"
    return "Neutral"


In [None]:
sent_df["Argument"]=sent_df["Sentence "].apply(argument_classification)

In [None]:
sent_df["Argument"].unique()

In [None]:
print(sent_df.head(10))


## Top 10 Pivotal sentences : 

In [None]:
# Balanced selection: the first five "For" rows and the first five "Against"
# rows of sent_df, in its current row order.  Fewer than ten rows result when
# either stance has fewer than five sentences.
is_for = sent_df["Argument"] == "For"
is_against = sent_df["Argument"] == "Against"
for_sentences = sent_df[is_for].head(5)
against_sentences = sent_df[is_against].head(5)

# Columns kept for the report, in display order.
report_columns = ["Sentence ", "Page ", "Line ", "Argument", "Score"]
top_10 = pd.concat([for_sentences, against_sentences])[report_columns]

print("Top 10 sentences : \n", top_10)

## Abstractive Summarization : 

In [None]:
import re

def clean_text_for_summary(text):
    """Strip court-filing boilerplate from ``text`` before summarisation.

    Removes "Case ... Document ... Filed ..." banners and "Page N of M"
    markers, drops period-delimited segments written entirely in upper case
    (likely headers), and collapses whitespace.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text, with sentence-ending periods preserved.
    """
    # Remove case numbers.  Docket numbers can contain letters as well as
    # digits (e.g. "2:22-cv-00223-Z"), hence \w rather than \d.
    text = re.sub(r'Case \d+:[\w\-]+ Document \d+ Filed \d+/\d+/\d+', '', text)
    # Remove page numbers
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Drop all-uppercase segments (likely headers or states).  Re-join with
    # "." so the periods consumed by split() are restored and sentence
    # boundaries survive for the summariser.
    kept_segments = [segment for segment in text.split('.') if not segment.isupper()]
    text = ".".join(kept_segments)
    # Remove extra whitespace
    text = " ".join(text.split())
    return text

In [None]:
# Bind the result to its own name: assigning it to `clean_text` would replace
# the clean_text() helper defined earlier with a string and break any re-run
# of the sentence-mapping cell.
cleaned_full_text = clean_text_for_summary(full_text)
# No truncation happens here; input length is limited by the tokenizer
# (truncation=True, max_length) inside summarize_text.
input_text = cleaned_full_text

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def summarize_text(text, max_input_len=512, max_output_len=150):
    """Generate a summary of ``text`` with the module-level T5 model.

    Args:
        text: Text to summarise; it is prefixed with "summarize: " before
            tokenisation.
        max_input_len: Token limit for the encoded input; longer input is
            truncated by the tokenizer.
        max_output_len: Upper bound passed to generation as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_len,
    )
    # Generation settings: 4 beams, at least 40 tokens, length penalty 2.0.
    generation_options = {
        "max_length": max_output_len,
        "min_length": 40,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Summarize only the top 10 sentences
top_text = " ".join(top_10["Sentence "].tolist())
summary_text = summarize_text(top_text)


In [None]:
# Take the first 10 rows of sent_df (the highest-scored sentences once it has
# been sorted by Score), regardless of their For/Against tag
top_sentences = sent_df.head(10)["Sentence "].tolist()
summary_input = " ".join(top_sentences)

# Summarize using T5; this replaces the summary_text computed earlier from the
# balanced top_10 selection
summary_text = summarize_text(summary_input, max_input_len=512, max_output_len=500)
print("Top 10 sentences Abstractive Summary:\n", summary_text)


## Converting into JSON Format :

In [None]:
top_10.head()

In [None]:
top_10["Keywords"] = top_10["Sentence "].apply(extract_keywords)


In [None]:
sent_df["Page "] = sent_df["Page "].apply(lambda x: x.page_number if hasattr(x, "page_number") else x)

In [None]:
import json

output = {
    # Round-trip through pandas' own JSON writer: it emits null for missing
    # Page/Line values, whereas json.dumps on to_dict() output would print the
    # non-standard token NaN and produce invalid JSON.
    "Top_10_Sentences": json.loads(top_10.to_json(orient="records")),
    "Summary": summary_text
}

# Pretty print; ensure_ascii=False keeps non-ASCII characters from the source
# document readable instead of \u-escaped.
print(json.dumps(output, indent=2, ensure_ascii=False))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

df_vis = pd.DataFrame(output["Top_10_Sentences"])
sns.countplot(x="Argument", data=df_vis, palette="Set2")
plt.title("Distribution of Arguments (For vs Against)")
plt.show()

In [None]:
from collections import Counter
from wordcloud import WordCloud

# Flatten the per-sentence keyword lists into one list.  A dedicated name is
# used because rebinding `keywords` here would overwrite the document-level
# keyword list that keyword_list is derived from.
cloud_keywords = [kw for item in df_vis["Keywords"] for kw in item]
word_freq = Counter(cloud_keywords)

wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Keyword Importance (WordCloud)")
plt.show()


In [None]:
sns.barplot(x="Score", y="Sentence ", data=df_vis, palette="Blues_d")
plt.title("Top 10 Sentences by Score")
plt.xlabel("Score")
plt.ylabel("Sentence")
plt.show()


In [None]:
sns.histplot(df_vis["Page "].dropna(), bins=10, kde=False)
plt.title("Distribution of Key Sentences Across Pages")
plt.xlabel("Page Number")
plt.ylabel("Count")
plt.show()


In [None]:
# Install if not available
# !pip install python-pptx

from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor


def _add_text_slide(deck, heading, body):
    """Append a slide using layout index 1 and fill its title and placeholder 1."""
    new_slide = deck.slides.add_slide(deck.slide_layouts[1])
    new_slide.shapes.title.text = heading
    new_slide.placeholders[1].text = body
    return new_slide


# Slides 2-9 as (title, body text) pairs, in presentation order.
_MIDDLE_SLIDES = [
    # --- Slide 2: Problem Statement ---
    (
        "Problem Statement",
        "In legal proceedings, attorneys need to analyze lengthy documents quickly.\n"
        "Our task: Develop an AI tool that extracts the Top 10 pivotal arguments (For & Against), "
        "with references to page and line numbers, to help attorneys prepare effectively.",
    ),
    # --- Slide 3: Approach Overview ---
    (
        "Approach of the Project",
        "1. Collect Data: Extract text from PDF using pdfplumber.\n"
        "2. Structure Document: Split into {page, line, text}.\n"
        "3. Keyword Extraction: Use KeyBERT / TF-IDF.\n"
        "4. Sentence Scoring: Rank sentences based on keywords.\n"
        "5. Argument Classification: Tag as For / Against.\n"
        "6. Reference Mapping: Attach page & line numbers.\n"
        "7. Top 10 Selection: Ensure balanced arguments.\n"
        "8. Summarization: Generate abstractive summary with T5.\n"
        "9. Output: Present in JSON/Table format.",
    ),
    # --- Slide 4: Data Processing ---
    (
        "Data Processing",
        "- Extracted text using pdfplumber.\n"
        "- Structured into rows with Page, Line, Text.\n"
        "- Applied cleaning & tokenization using NLTK.\n"
        "- Prepared text for keyword extraction and scoring.",
    ),
    # --- Slide 5: Keyword Extraction ---
    (
        "Keyword Extraction",
        "- Applied KeyBERT to extract top keywords.\n"
        "- Keywords highlight critical legal terms.\n"
        "- Used TF-IDF for scoring importance.",
    ),
    # --- Slide 6: Sentence Scoring & Classification ---
    (
        "Sentence Scoring & Classification",
        "- Scored sentences based on keyword frequency.\n"
        "- Classified as 'For' or 'Against' using rule-based matching.\n"
        "- Ensured balanced selection of 5 'For' and 5 'Against'.",
    ),
    # --- Slide 7: Top 10 Sentences Example ---
    (
        "Top 10 Key Sentences",
        "Example Output:\n"
        "• 'Under our Constitution, States have the primary authority to legislate...' (For)\n"
        "• 'In 2016, the FDA extended the approved use of mifepristone...' (Against)\n"
        "• Each entry includes Page & Line references.",
    ),
    # --- Slide 8: Abstractive Summary ---
    (
        "Abstractive Summary",
        "Generated summary using T5 model:\n"
        "“The agency relied on Subpart H when it first approved mifepristone in 2000. "
        "It required supervision by physicians, reflecting states' power to regulate healthcare.”",
    ),
    # --- Slide 9: Output Format ---
    (
        "Final Output",
        "- JSON file with Top 10 sentences, arguments, keywords.\n"
        "- Table format for readability.\n"
        "- Includes page & line references for quick lookup.",
    ),
]

# Create presentation
prs = Presentation()

# --- Slide 1: Title ---
cover_slide = prs.slides.add_slide(prs.slide_layouts[0])
cover_slide.shapes.title.text = "Developing a Strategic Legal Preparation Tool"
cover_slide.placeholders[1].text = "AI & Machine Learning Project\nPrepared by: Uma Pravallika"

# --- Slides 2-9: title + body text ---
for slide_heading, slide_body in _MIDDLE_SLIDES:
    _add_text_slide(prs, slide_heading, slide_body)

# --- Slide 10: Screenshots & Visuals ---
screenshots_slide = prs.slides.add_slide(prs.slide_layouts[5])  # Title only
screenshots_slide.shapes.title.text = "Screenshots of Outputs"
# (You can manually insert images from Outputs.zip here after generating ppt)

# --- Slide 11: Conclusion & Future Scope ---
_add_text_slide(
    prs,
    "Conclusion & Future Scope",
    "- Automated legal document summarization saves time.\n"
    "- Helps attorneys focus on key case aspects.\n"
    "- Future scope: Use GPT/LLMs for deeper contextual analysis, "
    "multi-document summarization, and interactive dashboards.",
)

# Save presentation
prs.save("Strategic_Legal_Preparation_Tool_Final.pptx")
print("✅ PPT Generated: Strategic_Legal_Preparation_Tool_Final.pptx")
