<a href="https://colab.research.google.com/github/Sneha-bhat24/Legal_Analyser/blob/main/DataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.4.0


In [None]:
!pip install summa

Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25l[?25hdone
  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54387 sha256=85a60fd936894ffd70eba8ffe6f2f059fdc4144f3e6837ef3df218c177421534
  Stored in directory: /root/.cache/pip/wheels/10/2d/7a/abce87c4ea233f8dcca0d99b740ac0257eced1f99a124a0e1f
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0


In [None]:
import os
import json
import spacy
import fitz  # PyMuPDF
from summa import summarizer  # TextRank-based Summarization
import re

# ✅ Load spaCy's English NLP Model
# Loaded once at module level; extract_legal_info() calls `nlp(text)` and
# iterates `doc.sents`, so this pipeline is what splits judgments into sentences.
# NOTE(review): assumes the "en_core_web_sm" model package is already installed
# in the runtime — no download step is visible in this notebook; confirm.
nlp = spacy.load("en_core_web_sm")

# 📜 Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenated page text with leading/trailing whitespace removed,
        or ``""`` if the file could not be opened or read (the error is
        printed rather than raised, so batch runs keep going).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so file handles are not leaked across a long batch run.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        print(f"❌ Error extracting text from {pdf_path}: {e}")
        return ""

# 🎯 Function to extract structured legal information
def extract_legal_info(text):
    """Extract structured fields from the raw text of a judgment.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``; each sentence is then matched against simple keyword and regex
    heuristics. For the single-valued fields (petitioner/respondent, court
    name, bench, final order) the LAST matching sentence wins.

    Args:
        text: Full text of one judgment.

    Returns:
        dict with the keys ``"Petitioner"``, ``"Respondent"``, ``"Court Name"``,
        ``"Bench"``, ``"Lawyers"`` (list of sentences), ``"Legal Sections
        Referred"`` (sorted list of unique matches), ``"Case Summary"``,
        ``"Judgment Summary"``, ``"Key Arguments"`` and ``"Final Order"``.
        A summary is ``"N/A"`` when no sentence matched its keywords.
    """
    doc = nlp(text)

    # 🔹 Extract Key Entities
    petitioner = respondent = court_name = bench = ""
    lawyer_names = []
    legal_sections = set()
    case_sentences = []
    judgment_sentences = []
    arguments = []
    final_order = ""

    # Define Keywords for Extraction (matched as lower-case substrings)
    keywords_case = ["case", "matter", "issue", "facts", "trial", "accused", "plaintiff"]
    keywords_judgment = ["judgment", "decision", "court held", "order", "verdict", "ruling"]
    keywords_arguments = ["argued", "submitted", "contention", "pleaded", "contended"]
    keywords_final_order = ["hence", "thus", "therefore", "the court rules", "the court orders", "convicted", "acquitted"]

    def _summarize(sentences):
        """Summarize the matched sentences, never returning an empty string."""
        if not sentences:
            return "N/A"
        joined = " ".join(sentences)
        # TextRank keeps int(n_sentences * ratio) sentences, which is zero for
        # very short inputs; fall back to the matched text instead of "".
        return summarizer.summarize(joined, ratio=0.3) or joined

    for sent in doc.sents:
        sent_text = sent.text.lower()

        # Extract Petitioner & Respondent (Basic Heuristics)
        if "versus" in sent_text or "vs." in sent_text:
            delimiter = "versus" if "versus" in sent_text else "vs."
            # Detection above is case-insensitive, so the split must be too;
            # otherwise "VERSUS" / "Vs." headings are detected but never split.
            parts = re.split(re.escape(delimiter), sent.text, flags=re.IGNORECASE)
            if len(parts) == 2:
                petitioner = parts[0].strip()
                respondent = parts[1].strip()

        # Identify Court & Bench
        if "in the supreme court" in sent_text or "in the high court" in sent_text:
            court_name = sent.text.strip()
        if "coram:" in sent_text or "bench:" in sent_text:
            bench = sent.text.strip()

        # Extract Lawyer Names (Basic Pattern Matching)
        if "appeared for" in sent_text or "represented by" in sent_text:
            lawyer_names.append(sent.text.strip())

        # Extract Legal Sections & Acts
        section_match = re.findall(r"\b(?:Section|Sec)\s\d+\b", sent.text)
        act_match = re.findall(r"\b[A-Z][a-z]+ Act\b", sent.text)
        legal_sections.update(section_match + act_match)

        # Extract Case Summary
        if any(keyword in sent_text for keyword in keywords_case):
            case_sentences.append(sent.text)

        # Extract Judgment Summary
        if any(keyword in sent_text for keyword in keywords_judgment):
            judgment_sentences.append(sent.text)

        # Extract Key Arguments
        if any(keyword in sent_text for keyword in keywords_arguments):
            arguments.append(sent.text)

        # Extract Final Order
        if any(keyword in sent_text for keyword in keywords_final_order):
            final_order = sent.text.strip()

    return {
        "Petitioner": petitioner,
        "Respondent": respondent,
        "Court Name": court_name,
        "Bench": bench,
        "Lawyers": lawyer_names,
        # Sorted so the JSON output is identical from run to run.
        "Legal Sections Referred": sorted(legal_sections),
        "Case Summary": _summarize(case_sentences),
        "Judgment Summary": _summarize(judgment_sentences),
        "Key Arguments": _summarize(arguments),
        "Final Order": final_order
    }

# 🔍 Process PDFs in Nested Folders & Extract Summaries
def process_pdfs_in_folder(folder_path, output_json="legal_case_data.json", batch_size=5,
                           output_dir="/content/drive/MyDrive/"):
    """Extract legal info from every PDF under a folder and save it as JSON.

    The folder is walked recursively. Each PDF is converted to text with
    ``extract_text_from_pdf`` and analysed with ``extract_legal_info``. PDFs
    that yield no text, or whose analysis raises, are reported and skipped.
    The JSON file is rewritten after every batch so that an interrupted run
    keeps the results gathered so far.

    Args:
        folder_path: Root folder to search for PDF files.
        output_json: Name of the JSON file to write inside ``output_dir``.
        batch_size: Number of PDFs processed between checkpoint saves.
        output_dir: Directory that receives the JSON file. Defaults to the
            location this notebook has always written to.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    extracted_data = []
    pdf_files = []

    # Walk through all subdirectories; match the extension case-insensitively
    # so files named "*.PDF" are not silently ignored.
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    # os.walk order is filesystem-dependent; sort for reproducible output.
    pdf_files.sort()

    output_path = os.path.join(output_dir, output_json)

    def _save():
        """Write everything extracted so far to the output file."""
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, indent=4)

    for i in range(0, len(pdf_files), batch_size):
        batch = pdf_files[i:i + batch_size]

        for pdf_path in batch:
            pdf_name = os.path.basename(pdf_path)
            print(f"🚀 Processing: {pdf_name}")

            text = extract_text_from_pdf(pdf_path)
            if not text:
                print(f"⚠️ No text extracted from {pdf_name}. Skipping.")
                continue

            try:
                case_info = extract_legal_info(text)
            except Exception as e:
                # One problematic document should not discard the whole run.
                print(f"❌ Error analysing {pdf_name}: {e}")
                continue
            extracted_data.append({"pdf_name": pdf_name, "case_info": case_info})

        # Checkpoint after each batch.
        _save()

    if not pdf_files:
        # Still produce an (empty) output file when nothing was found.
        _save()

    print(f"\n✅ All extractions completed! Data saved to {output_path} 🎉")

# 🏁 Run Extraction on a Given Folder
# Recursively processes every PDF under `folder_path`; with the default arguments
# the results are written to /content/drive/MyDrive/legal_case_data.json.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount call is visible in this notebook; confirm before running.
folder_path = "/content/drive/MyDrive/supreme/judgments_ai"  # Change this to your actual folder
process_pdfs_in_folder(folder_path)


🚀 Processing: 2022-06-15_6925_2021.pdf
🚀 Processing: 2022-06-16_19953_2019.pdf
🚀 Processing: 2022-06-01_16486_2022.pdf
🚀 Processing: 2022-06-16_24373_2014.pdf
🚀 Processing: 2022-06-22_31728_2021.pdf
🚀 Processing: 2022-06-10_16466_2022.pdf
🚀 Processing: 2022-06-01_23538_2011.pdf
🚀 Processing: 2022-06-03_2997_1995.pdf
🚀 Processing: 2022-06-03_16718_2022.pdf
🚀 Processing: 2022-06-03_5047_2022.pdf
🚀 Processing: 2022-06-16_13338_2019.pdf
🚀 Processing: 2022-06-22_23877_2021.pdf
🚀 Processing: 2022-06-02_35509_2011.pdf
🚀 Processing: 2022-06-24_19978_2015.pdf
🚀 Processing: 2022-06-10_15293_2022.pdf
🚀 Processing: 2022-06-03_30056_2019.pdf
🚀 Processing: 2022-06-27_18702_2022.pdf
🚀 Processing: 2022-06-13_15653_2009.pdf
🚀 Processing: 2022-06-16_31493_2016.pdf
🚀 Processing: 2022-06-14_18235_2011.pdf
🚀 Processing: 2022-06-13_15539_2007.pdf
🚀 Processing: 2022-06-09_17274_2022.pdf
🚀 Processing: 2022-06-14_3238_2022.pdf
🚀 Processing: 2022-06-24_34207_2018.pdf
🚀 Processing: 2022-09-19_26049_2022.pdf
🚀 Pr

In [None]:
!pip install summa # installing the summa module
import os
import json
import spacy
import fitz  # PyMuPDF
from summa import summarizer  # TextRank-based Summarization
import re

Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25l[?25hdone
  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54387 sha256=54806399c708197044b65d6f6f1f219bae2834405068016fade0b50c81ade0a6
  Stored in directory: /root/.cache/pip/wheels/10/2d/7a/abce87c4ea233f8dcca0d99b740ac0257eced1f99a124a0e1f
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0


In [None]:


# ✅ Load spaCy's English NLP Model
# Loaded once at module level; extract_legal_info() calls `nlp(text)` and
# iterates `doc.sents`, so this pipeline is what splits judgments into sentences.
# NOTE(review): assumes the "en_core_web_sm" model package is already installed
# in the runtime — no download step is visible in this notebook; confirm.
nlp = spacy.load("en_core_web_sm")

# 📜 Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenated page text with leading/trailing whitespace removed,
        or ``""`` if the file could not be opened or read (the error is
        printed rather than raised, so batch runs keep going).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so file handles are not leaked across a long batch run.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        print(f"❌ Error extracting text from {pdf_path}: {e}")
        return ""

# 🎯 Function to extract structured legal information
def extract_legal_info(text):
    """Extract structured fields from the raw text of a judgment.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``; each sentence is then matched against simple keyword and regex
    heuristics. For the single-valued fields (petitioner/respondent, court
    name, bench, final order) the LAST matching sentence wins.

    Args:
        text: Full text of one judgment.

    Returns:
        dict with the keys ``"Petitioner"``, ``"Respondent"``, ``"Court Name"``,
        ``"Bench"``, ``"Lawyers"`` (list of sentences), ``"Legal Sections
        Referred"`` (sorted list of unique matches), ``"Case Summary"``,
        ``"Judgment Summary"``, ``"Key Arguments"`` and ``"Final Order"``.
        A summary is ``"N/A"`` when no sentence matched its keywords.
    """
    doc = nlp(text)

    # 🔹 Extract Key Entities
    petitioner = respondent = court_name = bench = ""
    lawyer_names = []
    legal_sections = set()
    case_sentences = []
    judgment_sentences = []
    arguments = []
    final_order = ""

    # Define Keywords for Extraction (matched as lower-case substrings)
    keywords_case = ["case", "matter", "issue", "facts", "trial", "accused", "plaintiff"]
    keywords_judgment = ["judgment", "decision", "court held", "order", "verdict", "ruling"]
    keywords_arguments = ["argued", "submitted", "contention", "pleaded", "contended"]
    keywords_final_order = ["hence", "thus", "therefore", "the court rules", "the court orders", "convicted", "acquitted"]

    def _summarize(sentences):
        """Summarize the matched sentences, never returning an empty string."""
        if not sentences:
            return "N/A"
        joined = " ".join(sentences)
        # TextRank keeps int(n_sentences * ratio) sentences, which is zero for
        # very short inputs; fall back to the matched text instead of "".
        return summarizer.summarize(joined, ratio=0.3) or joined

    for sent in doc.sents:
        sent_text = sent.text.lower()

        # Extract Petitioner & Respondent (Basic Heuristics)
        if "versus" in sent_text or "vs." in sent_text:
            delimiter = "versus" if "versus" in sent_text else "vs."
            # Detection above is case-insensitive, so the split must be too;
            # otherwise "VERSUS" / "Vs." headings are detected but never split.
            parts = re.split(re.escape(delimiter), sent.text, flags=re.IGNORECASE)
            if len(parts) == 2:
                petitioner = parts[0].strip()
                respondent = parts[1].strip()

        # Identify Court & Bench
        if "in the supreme court" in sent_text or "in the high court" in sent_text:
            court_name = sent.text.strip()
        if "coram:" in sent_text or "bench:" in sent_text:
            bench = sent.text.strip()

        # Extract Lawyer Names (Basic Pattern Matching)
        if "appeared for" in sent_text or "represented by" in sent_text:
            lawyer_names.append(sent.text.strip())

        # Extract Legal Sections & Acts
        section_match = re.findall(r"\b(?:Section|Sec)\s\d+\b", sent.text)
        act_match = re.findall(r"\b[A-Z][a-z]+ Act\b", sent.text)
        legal_sections.update(section_match + act_match)

        # Extract Case Summary
        if any(keyword in sent_text for keyword in keywords_case):
            case_sentences.append(sent.text)

        # Extract Judgment Summary
        if any(keyword in sent_text for keyword in keywords_judgment):
            judgment_sentences.append(sent.text)

        # Extract Key Arguments
        if any(keyword in sent_text for keyword in keywords_arguments):
            arguments.append(sent.text)

        # Extract Final Order
        if any(keyword in sent_text for keyword in keywords_final_order):
            final_order = sent.text.strip()

    return {
        "Petitioner": petitioner,
        "Respondent": respondent,
        "Court Name": court_name,
        "Bench": bench,
        "Lawyers": lawyer_names,
        # Sorted so the JSON output is identical from run to run.
        "Legal Sections Referred": sorted(legal_sections),
        "Case Summary": _summarize(case_sentences),
        "Judgment Summary": _summarize(judgment_sentences),
        "Key Arguments": _summarize(arguments),
        "Final Order": final_order
    }

# 🔍 Process PDFs in Nested Folders & Extract Summaries
def process_pdfs_in_folder(folder_path, output_json="legal_case_data.json", batch_size=5,
                           output_dir="/content/drive/MyDrive/"):
    """Extract legal info from every PDF under a folder and save it as JSON.

    The folder is walked recursively. Each PDF is converted to text with
    ``extract_text_from_pdf`` and analysed with ``extract_legal_info``. PDFs
    that yield no text, or whose analysis raises, are reported and skipped.
    The JSON file is rewritten after every batch so that an interrupted run
    keeps the results gathered so far.

    Args:
        folder_path: Root folder to search for PDF files.
        output_json: Name of the JSON file to write inside ``output_dir``.
        batch_size: Number of PDFs processed between checkpoint saves.
        output_dir: Directory that receives the JSON file. Defaults to the
            location this notebook has always written to.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    extracted_data = []
    pdf_files = []

    # Walk through all subdirectories; match the extension case-insensitively
    # so files named "*.PDF" are not silently ignored.
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    # os.walk order is filesystem-dependent; sort for reproducible output.
    pdf_files.sort()

    output_path = os.path.join(output_dir, output_json)

    def _save():
        """Write everything extracted so far to the output file."""
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, indent=4)

    for i in range(0, len(pdf_files), batch_size):
        batch = pdf_files[i:i + batch_size]

        for pdf_path in batch:
            pdf_name = os.path.basename(pdf_path)
            print(f"🚀 Processing: {pdf_name}")

            text = extract_text_from_pdf(pdf_path)
            if not text:
                print(f"⚠️ No text extracted from {pdf_name}. Skipping.")
                continue

            try:
                case_info = extract_legal_info(text)
            except Exception as e:
                # One problematic document should not discard the whole run.
                print(f"❌ Error analysing {pdf_name}: {e}")
                continue
            extracted_data.append({"pdf_name": pdf_name, "case_info": case_info})

        # Checkpoint after each batch.
        _save()

    if not pdf_files:
        # Still produce an (empty) output file when nothing was found.
        _save()

    print(f"\n✅ All extractions completed! Data saved to {output_path} 🎉")

# 🏁 Run Extraction on a Given Folder
# Recursively processes every PDF under `folder_path`; with the default arguments
# the results are written to /content/drive/MyDrive/legal_case_data.json.
# NOTE(review): this is the same default output file the earlier run cell
# (supreme/judgments_ai) writes, so running both overwrites the first
# run's results — pass a different output_json if both are needed.
folder_path = "/content/drive/MyDrive/HIGH"  # Change this to your actual folder
process_pdfs_in_folder(folder_path)


🚀 Processing: case1.pdf
🚀 Processing: case2.pdf
🚀 Processing: case3.pdf
🚀 Processing: case4.pdf
🚀 Processing: case5.pdf
🚀 Processing: case6.pdf
🚀 Processing: case7.pdf
🚀 Processing: case8.pdf
🚀 Processing: case9.pdf

✅ All extractions completed! Data saved to legal_case_data.json 🎉
