In [9]:
!pip install transformers
!pip install frontend
!pip install streamlit
!pip install tools
!pip install optuna

Collecting frontend
  Downloading frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting itsdangerous>=1.1.0 (from frontend)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting aiofiles (from frontend)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading frontend-0.0.3-py3-none-any.whl (32 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: itsdangerous, aiofiles, frontend
Successfully installed aiofiles-24.1.0 frontend-0.0.3 itsdangerous-2.2.0


In [3]:
!pip install fitz

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting httplib2 (from fitz)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nibabel (from fitz)
  Downloading nibabel-5.3.2-py3-none-any.whl.metadata (9.1 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.9.2-py3-none-any.whl.metadata (6.8 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting importlib-resources>=5.12 (from nibabel->fitz)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting pydot>=1.2.3 (from nipype->fitz)
  Downloading pydot-3.0.4-py3-none-any.whl.metadata

In [1]:
#ACCEPTANCE 1
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import optuna
import re
import os
import fitz  # PyMuPDF for handling PDF files

# ========== TEXT PROCESSING FUNCTIONS ==========

def sanitize_text(text):
    """Removes illegal characters from text."""
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

def chunk_text(text, max_length=2000):
    """Splits text into manageable chunks while preserving full sentences."""
    words = text.split()
    chunks = []
    current_chunk = ""

    for word in words:
        if len(current_chunk + " " + word) <= max_length:
            current_chunk += " " + word if current_chunk else word
        else:
            chunks.append(current_chunk.strip())
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# ========== PDF TEXT EXTRACTION FUNCTION ==========

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# ========== MODEL LOADING FUNCTION ==========

def load_model_and_tokenizer(model_name):
    """Loads model and tokenizer efficiently."""
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    if torch.cuda.is_available():
        model.to("cuda")  # Move to GPU if available

    print("Model loaded successfully.")
    return model, tokenizer

# ========== ADVANCED LEGAL PROMPT GENERATOR ==========

def generate_advanced_legal_prompt(case_type):
    """Generates an optimized legal case summarization prompt based on case type."""
    
    prompts = {
        "contract_dispute": (
            "Summarize the following legal case with a focus on contractual obligations, "
            "breach details, and the legal consequences of non-compliance. "
            "Ensure that arbitration or court rulings are accurately represented. "
            "Explicitly outline whether force majeure was invoked and whether it was legally upheld. "
            "Avoid unnecessary commentary, fabricated case numbers, or unrelated legal precedents."
        ),
        "employment_law": (
            "Provide a concise and structured summary of the employment dispute, "
            "highlighting claims by both employer and employee, key evidence presented, "
            "and the final ruling. Clearly state if labor laws, wrongful termination, "
            "or workplace discrimination played a role in the judgment. "
            "Avoid unrelated legal references or fabricated citations."
        ),
        "criminal_law": (
            "Summarize the criminal case by identifying the charges, legal arguments from both prosecution and defense, "
            "and the final court ruling. Clearly differentiate between factual evidence, witness testimonies, "
            "and applicable laws referenced during the proceedings. "
            "Do not fabricate legal citations or case numbers."
        ),
        "intellectual_property": (
            "Summarize the legal case focusing on intellectual property rights, infringement claims, "
            "and legal reasoning behind the court’s decision. Explicitly mention whether fair use, "
            "patent validity, or copyright law principles were applied. "
            "Ensure clarity without adding fabricated legal precedents or citations."
        ),
    }

    # Default fallback prompt for unlisted case types
    default_prompt = (
        "Summarize the legal case with a structured approach, ensuring clarity, factual accuracy, "
        "and legal precision. Identify key issues, claims from both parties, and the final ruling. "
        "Ensure the summary remains strictly within the context of the case without adding fabricated case numbers, "
        "unrelated legal precedents, or speculative conclusions."
    )

    return prompts.get(case_type.lower(), default_prompt)

# ========== SUMMARY CLEANING FUNCTION ==========

def clean_summary(summary):
    """Ensures the generated summary ends naturally without unnecessary repetitions."""
    summary = re.sub(r'### End-Note:.*', '', summary, flags=re.DOTALL).strip()
    summary = re.sub(r'### Response:.*', '', summary, flags=re.DOTALL).strip()

    # Limit excessive conclusions
    sentences = summary.split(". ")
    if len(sentences) > 4:  # Limit the length of the summary
        summary = ". ".join(sentences[:4]) + "."

    return summary

# ========== SUMMARIZATION FUNCTION ==========

def generate_summary(input_text, model, tokenizer, trial, case_type):
    """Generates a structured summary with optimized hyperparameters."""
    prompt = generate_advanced_legal_prompt(case_type)
    input_str = f"{prompt}\n\n### Document:\n{input_text[:4096]}\n\n### Summary:"

    # Dynamic Hyperparameter Tuning
    max_new_tokens = trial.suggest_int("max_new_tokens", 200, 300)
    no_repeat_ngram_size = trial.suggest_int("no_repeat_ngram_size", 3, 4)
    num_beams = trial.suggest_int("num_beams", 5, 6)
    temperature = trial.suggest_float("temperature", 0.1, 0.25)
    top_p = trial.suggest_float("top_p", 0.75, 0.85)
    top_k = trial.suggest_int("top_k", 40, 60)
    do_sample = False  # Force deterministic output

    model_inputs = tokenizer(
        input_str,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=4096
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    summary_output = model.generate(
        model_inputs.input_ids,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=do_sample
    )

    full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
    marker = "### Summary:"
    summary = full_output.split(marker, 1)[1].strip() if marker in full_output else full_output.strip()

    return sanitize_text(clean_summary(summary))

# ========== PDF PROCESSING FUNCTION ==========

def process_pdf_folder(folder_path, case_type, model, tokenizer, trial):
    """Process all PDF files in a folder and generate summaries."""
    results = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(file_path)
            sanitized_text = sanitize_text(text)
            chunks = chunk_text(sanitized_text)
            summaries = [generate_summary(chunk, model, tokenizer, trial, case_type) for chunk in chunks]
            final_summary = " ".join(summaries)
            results[filename] = final_summary
    return results

# ========== OPTUNA OBJECTIVE FUNCTION ==========

def objective(trial, folder_path, case_type, model, tokenizer):
    """Objective function for hyperparameter tuning with Optuna."""
    results = process_pdf_folder(folder_path, case_type, model, tokenizer, trial)
    total_length = sum(len(summary) for summary in results.values())
    return total_length  # You can modify this to use a more meaningful metric

# ========== MAIN SCRIPT EXECUTION ==========

if __name__ == "__main__":
    # Load model and tokenizer
    model_name = "coderop12/Empowering_Legal_Summarization"
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Specify the folder containing PDF files and the case type
    folder_path = input("Enter the path to the directory containing the PDF files: ")
    case_type = input("Enter the case type (e.g., contract_dispute, employment_law, criminal_law, intellectual_property): ")

    # Optimize parameters using Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, folder_path, case_type, model, tokenizer), n_trials=10)

    best_trial = study.best_trial
    best_params = best_trial.params
    print(f"\nBest Hyperparameters: {best_params}\n")

    # Process the PDF folder and generate summaries using the best parameters
    results = process_pdf_folder(folder_path, case_type, model, tokenizer, best_trial)

    # Print the summaries
    for file, summary in results.items():
        print(f"Summary for {file}:")
        print(summary)
        print("-" * 80)

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.


[I 2025-03-02 07:48:06,086] A new study created in memory with name: no-name-1d279568-574f-41dc-9e4f-9dbd3f5774be
[I 2025-03-02 07:48:37,437] Trial 0 finished with value: 1102.0 and parameters: {'max_new_tokens': 264, 'no_repeat_ngram_size': 3, 'num_beams': 5, 'temperature': 0.196242444289175, 'top_p': 0.7888446935379861, 'top_k': 48}. Best is trial 0 with value: 1102.0.
[I 2025-03-02 07:49:09,800] Trial 1 finished with value: 1048.0 and parameters: {'max_new_tokens': 277, 'no_repeat_ngram_size': 4, 'num_beams': 5, 'temperature': 0.11673044047887313, 'top_p': 0.7673773298299861, 'top_k': 44}. Best is trial 0 with value: 1102.0.
[I 2025-03-02 07:49:36,614] Trial 2 finished with value: 907.0 and parameters: {'max_new_tokens': 214, 'no_repeat_ngram_size': 4, 'num_beams': 6, 'temperature': 0.23575516968631188, 'top_p': 0.7737967504455827, 'top_k': 58}. Best is trial 0 with value: 1102.0.
[I 2025-03-02 07:50:07,756] Trial 3 finished with value: 1102.0 and parameters: {'max_new_tokens': 268,


Best Hyperparameters: {'max_new_tokens': 264, 'no_repeat_ngram_size': 3, 'num_beams': 5, 'temperature': 0.196242444289175, 'top_p': 0.7888446935379861, 'top_k': 48}

Summary for Criminal_Case_File_John_Doe.pdf:
The defendant was charged with first-degree murder, aggravated assault, and burglary in connection with the death of Emily White. The prosecution argued that the defendant had a motive to kill the victim and that he had the opportunity to commit the crime. The defense argued that there was no evidence linking the defendant to the crime and that the prosecution had failed to prove its case beyond a reasonable doubt. The court found that the evidence was sufficient to support a finding of guilt. The State is seeking a sentence of life in prison without the possibility of parole for the defendant, John Doe. The defendant is charged with one count of murder in the first degree, one count conspiracy to commit murder, and one count possession of a firearm by a person who has previous

In [3]:
!pip install FPDF

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting FPDF
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: FPDF
  Building wheel for FPDF (setup.py) ... [?25ldone
[?25h  Created wheel for FPDF: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40758 sha256=fcee241ff767c723af9965305c3bc89d300fa1772b8536485f2fc268038b60a9
  Stored in directory: /home/zeus/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built FPDF
Installing collected packages: FPDF
Successfully installed FPDF-1.7.2


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import optuna
import re
import os
import fitz  # PyMuPDF for PDF processing
from rouge_score import rouge_scorer

# ========== TEXT PROCESSING FUNCTIONS ==========

def sanitize_text(text):
    """Removes illegal characters from text."""
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

def chunk_text(text, max_length=3000):
    """Splits long text into smaller, structured chunks."""
    words = text.split()
    chunks = []
    current_chunk = ""

    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += " " + word if current_chunk else word
        else:
            chunks.append(current_chunk.strip())
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# ========== PDF TEXT EXTRACTION FUNCTION ==========

def extract_text_from_pdf(pdf_path):
    """Extracts and processes text from a PDF file."""
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' does not exist.")
        return ""

    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"
        doc.close()
        return sanitize_text(text.strip())
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# ========== MODEL LOADING FUNCTION ==========

def load_model_and_tokenizer(model_name):
    """Loads tokenizer and model efficiently."""
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    if torch.cuda.is_available():
        model.to("cuda")  # Move to GPU if available

    print("Model loaded successfully.")
    return model, tokenizer

# ========== ADVANCED LEGAL PROMPT GENERATOR ==========

def generate_advanced_legal_prompt(case_type):
    """Generates structured prompts with few-shot examples for better accuracy."""
    
    prompts = {
        "criminal_law": """
You are an expert legal assistant summarizing criminal cases.
Create a detailed summary with the following structure:
1. Defendant details and charges
2. Evidence collected
3. Key prosecution arguments
4. Key defense arguments
5. Current case status

Focus on key elements that determine the case outcome.
""",
        "contract_dispute": """
You are an expert legal assistant summarizing contract disputes.
Create a detailed summary with the following structure:
1. Parties involved
2. Nature of the contract and alleged breach
3. Key legal arguments from plaintiff
4. Key legal arguments from defendant
5. Case status or resolution

Focus on key elements that determine the case outcome.
""",
        "employment_law": """
You are an expert legal assistant summarizing employment law cases.
Create a structured summary with:
1. Employee/employer details
2. Nature of the dispute
3. Key claims by employee
4. Key defense by employer
5. Case status or ruling

Focus on key elements that determine the case outcome.
""",
        "intellectual_property": """
You are an expert legal assistant summarizing intellectual property cases.
Create a structured summary with:
1. Parties involved
2. Nature of alleged infringement
3. Key arguments from plaintiff
4. Key arguments from defendant
5. Case status or ruling

Focus on key elements that determine the case outcome.
""",
        "default_prompt": """
Summarize the legal case with clarity:
1. Key parties involved
2. Nature of the dispute
3. Main arguments from both sides
4. Current status or resolution

Focus on key elements that determine the case outcome.
"""
    }

    return prompts.get(case_type.lower(), prompts["default_prompt"])

# ========== EVALUATION FUNCTION ==========

def evaluate_summary(generated_summary, reference_text):
    """
    Evaluates the quality of the generated summary.
    Returns a score that can be used for optimization.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text[:1000], generated_summary)
    
    combined_score = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3.0
    
    # Additional heuristics for summary quality
    words = generated_summary.split()
    unique_words = set(words)
    
    # 1. Prefer summaries of reasonable length
    length_score = min(1.0, len(words) / 350)
    
    # 2. Penalize repetitions
    repetition_score = len(unique_words) / max(1, len(words))
    
    # 3. Reward information density
    info_density = min(1.0, len(unique_words) / 200)
    
    # Final weighted score
    final_score = (combined_score * 0.5) + (length_score * 0.2) + (repetition_score * 0.2) + (info_density * 0.1)
    
    return final_score

# ========== SUMMARIZATION FUNCTION ==========

def generate_summary(input_text, model, tokenizer, hyperparams, case_type):
    """Generates structured summary with specified hyperparameters."""
    prompt = generate_advanced_legal_prompt(case_type)
    input_str = f"{prompt}\n\n### Document:\n{input_text[:4096]}\n\n### Summary:\n"

    model_inputs = tokenizer(input_str, return_tensors="pt", padding=True, truncation=True, max_length=4096).to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=hyperparams.get("max_new_tokens", 300),
            num_beams=hyperparams.get("num_beams", 4),
            temperature=hyperparams.get("temperature", 0.2),
            do_sample=hyperparams.get("do_sample", True),
            top_p=hyperparams.get("top_p", 0.9),
            no_repeat_ngram_size=3
        )

    full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
    
    return sanitize_text(full_output.split("### Summary:")[-1].strip())

# ========== OBJECTIVE FUNCTION FOR OPTUNA ==========

def objective(trial, input_text, model, tokenizer, case_type, reference_text):
    """Objective function for Optuna optimization."""
    summary = generate_summary(input_text, model, tokenizer, {
        "max_new_tokens": trial.suggest_int("max_new_tokens", 250, 350),
        "num_beams": trial.suggest_int("num_beams", 3, 5),
        "temperature": trial.suggest_float("temperature", 0.1, 0.3),
        "do_sample": trial.suggest_categorical("do_sample", [True]),
        "top_p": trial.suggest_float("top_p", 0.85, 0.95)
    }, case_type)
    
    return evaluate_summary(summary, reference_text)

# ========== MAIN EXECUTION BLOCK ==========

if __name__ == "__main__":
    model_name = "coderop12/Empowering_Legal_Summarization"
    model, tokenizer = load_model_and_tokenizer(model_name)

    pdf_path = input("Enter the path to the PDF file: ")
    input_text = extract_text_from_pdf(pdf_path)

    if input_text:
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: objective(trial, input_text, model, tokenizer, "default_prompt", input_text), n_trials=5)

        print("\n=== FINAL SUMMARY ===\n")
        print(generate_summary(input_text, model, tokenizer, study.best_trial.params, "default_prompt"))
    else:
        print("No text extracted from the PDF. Please check the file path.")


Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.


[I 2025-03-02 08:37:45,705] A new study created in memory with name: no-name-f3de6887-3ff1-40c8-8c3f-aeeec486afdd
[I 2025-03-02 08:38:11,361] Trial 0 finished with value: 0.4417839010366038 and parameters: {'max_new_tokens': 343, 'num_beams': 5, 'temperature': 0.1837575119456632, 'do_sample': True, 'top_p': 0.8518569772873625}. Best is trial 0 with value: 0.4417839010366038.
[I 2025-03-02 08:38:30,874] Trial 1 finished with value: 0.4018971765938979 and parameters: {'max_new_tokens': 304, 'num_beams': 4, 'temperature': 0.19608899091576665, 'do_sample': True, 'top_p': 0.9169928763336238}. Best is trial 0 with value: 0.4417839010366038.
[I 2025-03-02 08:38:50,459] Trial 2 finished with value: 0.39706987248700076 and parameters: {'max_new_tokens': 303, 'num_beams': 4, 'temperature': 0.1339902249714613, 'do_sample': True, 'top_p': 0.9466764884383407}. Best is trial 0 with value: 0.4417839010366038.
[I 2025-03-02 08:39:15,147] Trial 3 finished with value: 0.44186194967225995 and parameters:


=== FINAL SUMMARY ===

John Doe is charged with first-degree murder, aggravated assault, and burglary in connection with the death of Emily White, a high school teacher in Metropolis. The prosecution intends to prove that John Doe killed Emily White by stabbing her multiple times in her own apartment. The defense will argue that there is no evidence linking John Doe to the crime and that he is innocent of all charges. The facts of the case are as follows: On the night of the incident, the victim was found dead by her neighbor. The police were called to the scene and found that the victim had been stabbed multiple times. The victim was taken to the local hospital, where she was pronounced dead on arrival. The investigation revealed that the defendant had been in a relationship with Emily White for several months prior to the incident. The two had a tumultuous relationship and had argued on several occasions. On the day of the crime, the defendant called the victim and asked her to meet

In [None]:
'''Parameter	Previous Tokens	Present Tokens	Impact
max_new_tokens	250 - 350	100 - 200	Shorter, more concise summaries
num_beams	3 - 5	4 - 6	Higher quality, fewer errors
top_p	0.75 - 0.85	0.8 - 0.95	Better token selection for clarity
do_sample	False	True	More diverse yet accurate text
'''
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import optuna
import re
import os
import fitz  # PyMuPDF for PDF processing

# ========== TEXT PROCESSING FUNCTIONS ==========

def sanitize_text(text):
    """Removes illegal characters from text."""
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

def extract_text_from_pdf(pdf_path):
    """Extracts and processes text from a PDF file."""
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' does not exist.")
        return ""

    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"
        doc.close()
        return sanitize_text(text.strip())
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# ========== MODEL LOADING FUNCTION ==========

def load_model_and_tokenizer(model_name):
    """Loads tokenizer and model efficiently."""
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    if torch.cuda.is_available():
        model.to("cuda")  # Move to GPU if available

    print("Model loaded successfully.")
    return model, tokenizer

# ========== LEGAL PROMPT GENERATOR ==========

def generate_advanced_legal_prompt(case_type):
    """Generates structured prompts for concise summarization."""
    
    prompts = {
        "criminal_law": """
Summarize this criminal case concisely:
1. Defendant details and charges
2. Key evidence presented
3. Prosecution's main arguments
4. Defense counterarguments
5. Case status
""",
        "contract_dispute": """
Summarize this contract dispute case concisely:
1. Parties involved
2. Nature of the contract and alleged breach
3. Key legal arguments from both sides
4. Case status or resolution
""",
        "employment_law": """
Summarize this employment law case concisely:
1. Employee/employer details
2. Nature of the dispute
3. Key claims by the employee
4. Key defenses by the employer
5. Case status
""",
        "intellectual_property": """
Summarize this intellectual property case concisely:
1. Parties involved
2. Nature of alleged infringement
3. Key legal arguments from both sides
4. Case status or ruling
""",
        "default_prompt": """
Summarize this legal case concisely:
1. Key parties involved
2. Nature of the dispute
3. Main arguments from both sides
4. Current status or resolution
"""
    }

    return prompts.get(case_type.lower(), prompts["default_prompt"])

# ========== SUMMARIZATION FUNCTION ==========

def generate_concise_summary(input_text, model, tokenizer, hyperparams, case_type):
    """Generates a structured, concise summary with optimized hyperparameters."""
    prompt = generate_advanced_legal_prompt(case_type)
    input_str = f"{prompt}\n\n### Document:\n{input_text[:4096]}\n\n### Summary:\n"

    model_inputs = tokenizer(input_str, return_tensors="pt", padding=True, truncation=True, max_length=4096).to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=hyperparams.get("max_new_tokens", 150),  # Enforced conciseness
            num_beams=hyperparams.get("num_beams", 5),  
            temperature=hyperparams.get("temperature", 0.2),
            do_sample=hyperparams.get("do_sample", True),
            top_p=hyperparams.get("top_p", 0.85),
            no_repeat_ngram_size=3
        )

    full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
    
    return sanitize_text(full_output.split("### Summary:")[-1].strip())

# ========== OBJECTIVE FUNCTION FOR OPTUNA ==========

def objective(trial, input_text, model, tokenizer, case_type):
    """Objective function for hyperparameter tuning using Optuna."""
    summary = generate_concise_summary(input_text, model, tokenizer, {
        "max_new_tokens": trial.suggest_int("max_new_tokens", 100, 200),  # Concise summary
        "num_beams": trial.suggest_int("num_beams", 4, 6),
        "temperature": trial.suggest_float("temperature", 0.1, 0.3),
        "do_sample": trial.suggest_categorical("do_sample", [True]),
        "top_p": trial.suggest_float("top_p", 0.8, 0.95)
    }, case_type)

    return len(summary)  # Placeholder scoring (use evaluation metric if needed)

# ========== MAIN EXECUTION BLOCK ==========

if __name__ == "__main__":
    model_name = "coderop12/Empowering_Legal_Summarization"
    model, tokenizer = load_model_and_tokenizer(model_name)

    pdf_path = input("Enter the path to the PDF file: ")
    input_text = extract_text_from_pdf(pdf_path)

    if input_text:
        study = optuna.create_study(direction="maximize")
        print("Optimizing summarization parameters with 15 trials for conciseness...")
        study.optimize(lambda trial: objective(trial, input_text, model, tokenizer, "default_prompt"), n_trials=15)  

        print("\n=== FINAL CONCISE SUMMARY ===\n")
        concise_summary = generate_concise_summary(input_text, model, tokenizer, study.best_trial.params, "default_prompt")
        print(concise_summary)
    else:
        print("No text extracted from the PDF. Please check the file path.")
#In this code  i forgotten to keep prompt parameter in input
but its good enough


Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.


[I 2025-03-02 08:46:44,740] A new study created in memory with name: no-name-713185b6-e8a7-4ec7-b596-d2235d22a9b8


Optimizing summarization parameters with 15 trials for conciseness...


[I 2025-03-02 08:46:57,788] Trial 0 finished with value: 970.0 and parameters: {'max_new_tokens': 188, 'num_beams': 5, 'temperature': 0.1548510009339974, 'do_sample': True, 'top_p': 0.8073313972911808}. Best is trial 0 with value: 970.0.
[I 2025-03-02 08:47:08,955] Trial 1 finished with value: 836.0 and parameters: {'max_new_tokens': 158, 'num_beams': 5, 'temperature': 0.2718870897849227, 'do_sample': True, 'top_p': 0.8650483669703447}. Best is trial 0 with value: 970.0.
[I 2025-03-02 08:47:21,370] Trial 2 finished with value: 823.0 and parameters: {'max_new_tokens': 158, 'num_beams': 6, 'temperature': 0.16931874656943355, 'do_sample': True, 'top_p': 0.8156332015587646}. Best is trial 0 with value: 970.0.
[I 2025-03-02 08:47:36,257] Trial 3 finished with value: 978.0 and parameters: {'max_new_tokens': 190, 'num_beams': 6, 'temperature': 0.12826875231479046, 'do_sample': True, 'top_p': 0.9473973229061159}. Best is trial 3 with value: 978.0.
[I 2025-03-02 08:47:48,986] Trial 4 finished w


=== FINAL CONCISE SUMMARY ===

The defendant was charged with first-degree murder, aggravated assault, and burglary in connection with the death of Emily White. The prosecution argued that the defendant had a motive to kill the victim and that he had the opportunity to commit the crime. The defense argued that there was no evidence linking the defendant to the crime and that the prosecution had not proven its case beyond a reasonable doubt. The case is still pending and the defendant is presumed innocent until proven guilty in a court of law. The facts of the case are as follows: The defendant was a freelance graphic designer and the victim was a high school teacher. They had been dating for a few months before the incident. On the night of the incident, the defendant called the victim several times but she did not answer. He then went to her apartment and forced his way in. The victim was found dead in her bed with multiple stab wounds to her neck and chest. The police were called to

In [1]:
#final_run
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import optuna
import re
import os
import fitz  # PyMuPDF for PDF processing

# ========== TEXT PROCESSING FUNCTIONS ==========

def sanitize_text(text):
    """Removes illegal characters from text."""
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

def extract_text_from_pdf(pdf_path):
    """Extracts and processes text from a PDF file."""
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' does not exist.")
        return ""

    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n"
        doc.close()
        return sanitize_text(text.strip())
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# ========== MODEL LOADING FUNCTION ==========

def load_model_and_tokenizer(model_name):
    """Loads tokenizer and model efficiently."""
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )

    if torch.cuda.is_available():
        model.to("cuda")  # Move to GPU if available

    print("Model loaded successfully.")
    return model, tokenizer

# ========== LEGAL PROMPT SELECTION ==========

def generate_advanced_legal_prompt(case_type):
    """Generates structured prompts for concise summarization."""
    
    prompts = {
        "contract_dispute": """
Summarize this contract dispute concisely:
1. Parties involved
2. Nature of the contract and alleged breach
3. Key legal arguments from both sides
4. Case status or resolution
""",
        "employment_law": """
Summarize this employment law case concisely:
1. Employee/employer details
2. Nature of the dispute
3. Key claims by the employee
4. Key defenses by the employer
5. Case status
""",
        "criminal_law": """
Summarize this criminal case concisely:
1. Defendant details and charges
2. Key evidence presented
3. Prosecution's main arguments
4. Defense counterarguments
5. Case status
""",
        "intellectual_property": """
Summarize this intellectual property case concisely:
1. Parties involved
2. Nature of alleged infringement
3. Key legal arguments from both sides
4. Case status or ruling
""",
        "default_prompt": """
Summarize this legal case concisely:
1. Key parties involved
2. Nature of the dispute
3. Main arguments from both sides
4. Current status or resolution
"""
    }

    return prompts.get(case_type.lower(), prompts["default_prompt"])

# ========== SUMMARIZATION FUNCTION ==========

def generate_concise_summary(input_text, model, tokenizer, hyperparams, case_type):
    """Generates a structured, concise summary with optimized hyperparameters."""
    prompt = generate_advanced_legal_prompt(case_type)
    input_str = f"{prompt}\n\n### Document:\n{input_text[:4096]}\n\n### Summary:\n"

    model_inputs = tokenizer(input_str, return_tensors="pt", padding=True, truncation=True, max_length=4096).to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=hyperparams.get("max_new_tokens", 150),  # Enforced conciseness
            num_beams=hyperparams.get("num_beams", 5),  
            temperature=hyperparams.get("temperature", 0.2),
            do_sample=hyperparams.get("do_sample", True),
            top_p=hyperparams.get("top_p", 0.85),
            no_repeat_ngram_size=3
        )

    full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
    
    return sanitize_text(full_output.split("### Summary:")[-1].strip())

# ========== OBJECTIVE FUNCTION FOR OPTUNA ==========

def objective(trial, input_text, model, tokenizer, case_type):
    """Objective function for hyperparameter tuning using Optuna."""
    summary = generate_concise_summary(input_text, model, tokenizer, {
        "max_new_tokens": trial.suggest_int("max_new_tokens", 100, 200),  # Concise summary
        "num_beams": trial.suggest_int("num_beams", 4, 6),
        "temperature": trial.suggest_float("temperature", 0.1, 0.3),
        "do_sample": trial.suggest_categorical("do_sample", [True]),
        "top_p": trial.suggest_float("top_p", 0.8, 0.95)
    }, case_type)

    return len(summary)  # Placeholder scoring (use evaluation metric if needed)

# ========== MAIN EXECUTION BLOCK ==========

if __name__ == "__main__":
    model_name = "coderop12/Empowering_Legal_Summarization"
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Case type selection
    case_types = {
        1: "contract_dispute",
        2: "employment_law",
        3: "criminal_law",
        4: "intellectual_property"
    }

    print("Available summarization types:")
    for key, value in case_types.items():
        print(f"{key}: {value.replace('_', ' ').title()}")

    choice = int(input("Enter the number corresponding to the type of summarization you want: "))
    case_type = case_types.get(choice, "default_prompt")

    # PDF File Input
    pdf_path = input("Enter the path to the PDF file: ")
    input_text = extract_text_from_pdf(pdf_path)

    if input_text:
        study = optuna.create_study(direction="maximize")
        print("Optimizing summarization parameters with 15 trials for conciseness...")
        study.optimize(lambda trial: objective(trial, input_text, model, tokenizer, case_type), n_trials=15)  

        print("\n=== FINAL CONCISE SUMMARY ===\n")
        concise_summary = generate_concise_summary(input_text, model, tokenizer, study.best_trial.params, case_type)
        print(concise_summary)
    else:
        print("No text extracted from the PDF. Please check the file path.")


Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.
Available summarization types:
1: Contract Dispute
2: Employment Law
3: Criminal Law
4: Intellectual Property


[I 2025-03-02 08:54:18,695] A new study created in memory with name: no-name-54f36d3b-17e6-4634-abf6-8400151192a2


Optimizing summarization parameters with 15 trials for conciseness...


[I 2025-03-02 08:54:30,219] Trial 0 finished with value: 846.0 and parameters: {'max_new_tokens': 164, 'num_beams': 4, 'temperature': 0.10449297595447743, 'do_sample': True, 'top_p': 0.8187036864928329}. Best is trial 0 with value: 846.0.
[I 2025-03-02 08:54:42,160] Trial 1 finished with value: 911.0 and parameters: {'max_new_tokens': 177, 'num_beams': 4, 'temperature': 0.18076788567526092, 'do_sample': True, 'top_p': 0.8668709012360236}. Best is trial 1 with value: 911.0.
[I 2025-03-02 08:54:58,481] Trial 2 finished with value: 1025.0 and parameters: {'max_new_tokens': 200, 'num_beams': 6, 'temperature': 0.20449127588870547, 'do_sample': True, 'top_p': 0.8034672831956983}. Best is trial 2 with value: 1025.0.
[I 2025-03-02 08:55:13,953] Trial 3 finished with value: 1003.0 and parameters: {'max_new_tokens': 195, 'num_beams': 6, 'temperature': 0.2522550576414827, 'do_sample': True, 'top_p': 0.8406816741340758}. Best is trial 2 with value: 1025.0.
[I 2025-03-02 08:55:26,075] Trial 4 finis


=== FINAL CONCISE SUMMARY ===

The defendant was charged with first-degree murder, aggravated assault, and burglary. The prosecution argued that the defendant was the perpetrator of the crime. The defense argued that there was no evidence linking the defendant to the crime and that the evidence was unreliable. The court found that the prosecution had failed to prove its case beyond a reasonable doubt and therefore acquitted the defendant of all charges. The case was dismissed. The facts of the case are as follows: The defendant was a freelance graphic designer and the victim was a high school teacher. They had been in a relationship for two years and had a child together. On the night of the incident, the defendant called the victim and asked her to meet him at his apartment. The victim arrived at the apartment and was met by the defendant, who was armed with a knife. He stabbed her multiple times and then fled the scene. The police were called to the scene and found the victim lying 

In [7]:
pip install rouge-score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [1]:
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Define reference and generated summaries
reference_summary = """
The defendant was charged with first-degree murder, aggravated assault, and burglary. 
The prosecution alleged that the defendant attacked the victim, his former partner, with a knife in his apartment. 
The defense argued that no concrete evidence linked him to the crime, and the court ruled in favor of the defense, citing reasonable doubt.

Case Details:
- Defendant: Freelance graphic designer, former partner of the victim.
- Victim: High school teacher, in a relationship with the defendant.
- Incident Summary: On the night of the crime, the defendant called the victim to his apartment, where she was fatally stabbed.
- Key Evidence: The prosecution presented circumstantial evidence, but the defense challenged its reliability.
- Court Ruling: The court acquitted the defendant due to insufficient proof beyond a reasonable doubt.

The case was dismissed.
"""

generated_summary = """
The defendant was charged with first-degree murder, aggravated assault, and burglary. 
The prosecution argued that the defendant was the perpetrator of the crime. 
The defense argued that there was no evidence linking the defendant to the crime and that the evidence was unreliable. 
The court found that the prosecution had failed to prove its case beyond a reasonable doubt and therefore acquitted the defendant of all charges. 
The case was dismissed. The facts of the case are as follows: The defendant was a freelance graphic designer and the victim was a high school teacher. 
They had been in a relationship for two years and had a child together. On the night of the incident, the defendant called the victim and asked her to meet him at his apartment. 
The victim arrived at the apartment and was met by the defendant, who was armed with a knife. He stabbed her multiple times and then fled the scene. 
The police were called to the scene and found the victim lying on the floor of the apartment. She was bleeding profusely and was pronounced dead.
"""

# Function to compute ROUGE scores
def compute_rouge_scores(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    
    return {
        "ROUGE-1": round(scores['rouge1'].fmeasure, 4),
        "ROUGE-2": round(scores['rouge2'].fmeasure, 4),
        "ROUGE-L": round(scores['rougeL'].fmeasure, 4)
    }

# Function to compute Cosine Similarity
def compute_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return round(cosine_similarity(tfidf_matrix)[0, 1], 4)

# Compute ROUGE scores
rouge_scores = compute_rouge_scores(reference_summary, generated_summary)

# Compute Cosine Similarity
cosine_score = compute_cosine_similarity(reference_summary, generated_summary)

# Combine results
evaluation_results = {**rouge_scores, "Cosine Similarity": cosine_score}

# Display results
df = pd.DataFrame(evaluation_results.items(), columns=["Metric", "Score"])
print("\n=== SUMMARY EVALUATION RESULTS ===")
print(df)



=== SUMMARY EVALUATION RESULTS ===
              Metric   Score
0            ROUGE-1  0.5916
1            ROUGE-2  0.3819
2            ROUGE-L  0.4373
3  Cosine Similarity  0.8449
