In [1]:
import json
import re
from transformers import pipeline

# Read the entire judgment into memory as UTF-8 text.
file_path = "hin_judgement.txt"
with open(file_path, mode="r", encoding="utf-8") as f:
    text = f.read()

# Run the three header searches up front; each one yields a match object or None.
# - case name: two captured groups on either side of " v. ", ending at a newline
# - case number: a parenthesised span that starts with "Special Leave Petition "
# - date: 1-2 digits, a word, then 4 digits, separated by whitespace
case_name_match = re.search(r"(.*?)\s+v\.\s+(.*?)\n", text)
case_number_match = re.search(r"\((Special Leave Petition .*?)\)", text)
date_match = re.search(r"(\b\d{1,2}\s+\w+\s+\d{4}\b)", text)

# Record only the fields whose pattern was actually found.
case_details = {}
if case_name_match is not None:
    # Both captured sides of " v. " are stored together as a tuple.
    case_details["case_name"] = case_name_match.groups()
if case_number_match is not None:
    case_details["case_number"] = case_number_match.group(1)
if date_match is not None:
    case_details["date"] = date_match[1]

# Extract legal issues
# The issues are the en-dash-separated fragments found between the
# "Issue for Consideration" and "Headnotes" markers.
issues_section = re.search(r"Issue for Consideration(.*?)Headnotes", text, re.DOTALL)
legal_issues = []
if issues_section:
    # Filter on the stripped fragment: a whitespace-only fragment (for example
    # the text before a leading dash) is truthy as-is and would otherwise be
    # stored as an empty string.
    legal_issues = [
        fragment.strip()
        for fragment in issues_section.group(1).split("–")
        if fragment.strip()
    ]

# Key findings: the text captured after each "Held: " marker, lazily, up to
# the next en dash or pair of consecutive newlines.
findings = re.compile(r"Held: (.*?)(?:–|\n\n)", re.DOTALL).findall(text)

# Referenced laws: whatever sits between "Constitution of India – " and the
# next colon.
laws = re.findall(r"Constitution of India – ([^:]*):", text)

# Cited cases: first isolate the span after "Case Law Cited" (ending before
# "Books and Periodicals Cited", or at the end of the text), then pull every
# "<letters/spaces/dots> v. <letters/spaces/dots>" occurrence out of it.
cases_section = re.search(
    r"Case Law Cited(.*?)(?=Books and Periodicals Cited|$)", text, flags=re.DOTALL
)
if cases_section:
    cited_cases = re.findall(
        r"([A-Za-z .]+ v\. [A-Za-z .]+)", cases_section.group(1)
    )
else:
    cited_cases = []

# Extract judgment text
# Lazily capture what follows "Judgment" up to the closest following
# "Appearances" or "Table of Contents" marker.
summary_match = re.search(
    r"Judgment(.*?)(?:Appearances|Table of Contents)", text, flags=re.DOTALL
)
if summary_match:
    summary = summary_match.group(1).strip()
else:
    # Placeholder stored when the section cannot be located.
    summary = "Not found"

# Summarization using Hugging Face with chunking
# Builds the summarization pipeline once so it can be reused for every chunk.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)


def chunk_text(text, max_tokens=500):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    Despite the parameter name, the size is counted in whitespace-separated
    words (as produced by ``str.split()``), not in model tokens.

    Args:
        text: The string to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of strings, each holding the words of one chunk joined by
        single spaces. Empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        # A negative step makes range() empty, which would silently drop the
        # whole text; fail loudly instead.
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")
    words = text.split()
    return [
        " ".join(words[start : start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]


# Only summarize when the extracted judgment is longer than 100 characters;
# shorter values (including the "Not found" placeholder) are kept unchanged.
if len(summary) > 100:
    chunks = chunk_text(summary, max_tokens=500)
    # chunk_text bounds each chunk by words, not model tokens, so a 500-word
    # chunk is not guaranteed to fit the model's input window. truncation=True
    # clips an oversized chunk at the tokenizer instead of letting the model
    # call fail; chunks that already fit are unaffected.
    # NOTE(review): max_length/min_length are fixed, so a short final chunk is
    # still asked for at least 100 tokens (the library warns when max_length
    # exceeds the input length) — consider scaling them per chunk.
    summarized_chunks = [
        summarizer(
            chunk,
            max_length=250,
            min_length=100,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]

    # Stitch the per-chunk summaries back together in their original order.
    summary = " ".join(summarized_chunks)

# Organizing extracted data
# Keys are added one by one, in the order they should appear in the JSON file.
extracted_data = {}
extracted_data["case_details"] = case_details
extracted_data["legal_issues"] = legal_issues
extracted_data["key_findings"] = findings
extracted_data["referenced_laws"] = laws
extracted_data["cited_cases"] = cited_cases
extracted_data["judgment_summary"] = summary

# Save as JSON
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes,
# which is why the file is opened explicitly as UTF-8.
output_file = "extracted_judgment.json"
with open(output_file, mode="w", encoding="utf-8") as json_file:
    json.dump(
        extracted_data,
        json_file,
        indent=4,
        ensure_ascii=False,
    )

print(f"Extracted data saved to {output_file}")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
Your max_length is set to 250, but your input_length is only 169. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)


Extracted data saved to extracted_judgment.json
