!pip install torch
!pip install bitsandbytes
!pip install datasets==2.13.1
!pip install scipy
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/lvwerra/trl.git


In [11]:
!pip install torch
!pip install bitsandbytes
!pip install datasets==2.13.1
!pip install scipy
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/lvwerra/trl.git


Collecting datasets==2.13.1
  Using cached datasets-2.13.1-py3-none-any.whl.metadata (20 kB)
Using cached datasets-2.13.1-py3-none-any.whl (486 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.3.2
    Uninstalling datasets-3.3.2:
      Successfully uninstalled datasets-3.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.16.0.dev0 requires datasets>=2.21.0, but you have datasets 2.13.1 which is incompatible.[0m[31m
[0mSuccessfully installed datasets-2.13.1
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-j2ml5ko1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-j2ml5ko1

  Resolved https://github.com/huggingface/accel

In [12]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig,DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

In [13]:
seed=42
set_seed(seed)

In [14]:
!pip install bitsandbytes



In [15]:
def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto", # dispatch efficiently the model on the available ressources
        max_memory = {i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [16]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="output_cleaned_fixed.jsonl",split="train")


In [17]:
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Number of prompts: 1898
Column names are: ['instruction', 'context', 'response', 'category']


In [18]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction', 'context', 'response')
    Then concatenate them using two newline characters
    :param sample: Sample dictionnary
    """

    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"

    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    input_context = f"{INPUT_KEY}\n{sample['context']}" if sample["context"] else None
    response = f"{RESPONSE_KEY}\n{sample['response']}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)

    sample["text"] = formatted_prompt

    return sample

In [19]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length =8192
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    # Apply preprocessing to each batch of the dataset & and remove 'instruction', 'context', 'response', 'category' fields
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [20]:
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config

In [21]:
def create_peft_config(modules):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to
    """
    config = LoraConfig(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

In [22]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [23]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
import os
os.environ["HF_TOKEN"] = "Replace wih  your token"


In [25]:
from huggingface_hub import login
login(token=os.getenv("HF_TOKEN"))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
model_name = "google/gemma-2-2b-it"

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
max_length = get_max_length(model)

dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

Found max lenth: 8192
Preprocessing dataset...


In [18]:
import torch

print(f"Allocated Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached Memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


Allocated Memory: 2.07 GB
Cached Memory: 4.90 GB


In [19]:
import os
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, get_peft_model

def train(model, tokenizer, dataset, output_dir):
    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare model for 8-bit training with PEFT
    model = prepare_model_for_kbit_training(model)

    # Find all linear module names (LoRA layers)
    modules = find_all_linear_names(model)

    # Create PEFT config and wrap the model
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print percentage of trainable parameters
    print_trainable_parameters(model)

    # Setup training parameters with optimized settings
    trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Reduce batch size
        gradient_accumulation_steps=16,  # Higher accumulation to compensate
        bf16=True,  # Use bf16 instead of fp16 if supported
        warmup_steps=5,
        max_steps=500,  
        logging_steps=10,  
        output_dir=output_dir,
        optim="paged_adamw_8bit",  # More memory-efficient optimizer
        save_strategy="epoch",  # Save only per epoch
        save_total_limit=1,  # Keep only last checkpoint
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)


    # Disable caching to ensure training performance isn't hindered
    model.config.use_cache = False

    # Verify and print data types of model parameters before training
    dtypes = {}
    for name, param in model.named_parameters():
        dtype = param.dtype
        if dtype not in dtypes:
            dtypes[dtype] = 0
        dtypes[dtype] += param.numel()
    total = sum(dtypes.values())
    for dtype, count in dtypes.items():
        print(f"{dtype}: {count} parameters, {count / total:.2%} of total")

    # Launch training if conditions are met
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics

    # Log and save metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Print final training metrics
    print(metrics)

    # Save the final model checkpoint
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free up CUDA memory
    del model, trainer
    torch.cuda.empty_cache()

# Define output directory for saving the model
output_dir = "resultstEST_2.0/llama2/final_checkpoint"
# Execute the training function
train(model, tokenizer, dataset, output_dir)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


all params: 1,622,970,624 || trainable params: 20,766,720 || trainable%: 1.279549961835908
torch.float32: 610832640 parameters, 37.64% of total
torch.uint8: 1012137984 parameters, 62.36% of total
Training...


It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
10,1.4469
20,1.3294
30,1.2829
40,1.257
50,1.2311
60,1.1995
70,1.1827
80,1.1824
90,1.1489
100,1.1646




***** train metrics *****
  epoch                    =      4.2025
  total_flos               = 185584124GF
  train_loss               =      1.0807
  train_runtime            =  1:56:53.94
  train_samples_per_second =       1.141
  train_steps_per_second   =       0.071
{'train_runtime': 7013.9411, 'train_samples_per_second': 1.141, 'train_steps_per_second': 0.071, 'total_flos': 1.9926943684972646e+17, 'train_loss': 1.0807353668212891, 'epoch': 4.2025316455696204}
Saving last checkpoint of the model...


In [20]:
output_dir = "results/gemma2b/legal_summarization"

In [24]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Correct path where the fine-tuned model is stored
output_dir = "/teamspace/studios/this_studio/resultstEST_2.0/llama2/final_checkpoint"

# Load fine-tuned PEFT model with LoRA adapters
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype="auto")

# Merge LoRA into the base model (this removes the adapter)
model = model.merge_and_unload()

# Define the path to save the fully merged model
merged_output_dir = "/teamspace/studios/this_studio/results/final"
model.save_pretrained(merged_output_dir, safe_serialization=True)

# Save tokenizer along with the merged model
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
tokenizer.save_pretrained(merged_output_dir)

print("✅ Model successfully merged and saved at:", merged_output_dir)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model successfully merged and saved at: /teamspace/studios/this_studio/results/final


In [22]:
'''model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)
'''

ValueError: Can't find 'adapter_config.json' at 'results/gemma2b/legal_summarization'

In [26]:
from huggingface_hub import HfApi

# Initialize the Hugging Face API
api = HfApi()

# Create a new repo (replace with your desired repo name)
repo_name = "Empowering_Legal_Summarization"  # Change this to your desired repo name
api.create_repo(repo_name, exist_ok=True)


RepoUrl('https://huggingface.co/coderop12/Empowering_Legal_Summarization', endpoint='https://huggingface.co', repo_type='model', repo_id='coderop12/Empowering_Legal_Summarization')

In [27]:
# 1. Make sure you have an up-to-date huggingface_hub:
#    pip install --upgrade huggingface_hub

from huggingface_hub import login, upload_folder

# Log in to Hugging Face
login(token="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")  # Replace with your API token

# Set the repository name and username
repo_name = "Empowering_Legal_Summarization"  # The repo must already exist on your Hugging Face account
repo_id = f"coderop12/{repo_name}"

# Local folder you want to upload
local_folder_path = r"/teamspace/studios/this_studio/results/final"

# Upload the entire folder to the root of the repo
upload_folder(
    folder_path=local_folder_path,
    repo_id=repo_id,
    commit_message="Uploading the entire final_merged_checkpoint folder"
)

print("Folder uploaded successfully!")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

Folder uploaded successfully!


In [28]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [30]:
import os
import PyPDF2
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Completely disable Torch Dynamo to prevent recompile errors
import torch._dynamo
torch._dynamo.config.suppress_errors = True  # Prevent crashing
torch._dynamo.config.cache_size_limit = 0  # Disable cache
torch._dynamo.config.verbose = False  # Reduce logging

# ✅ Disable PyTorch compilation for models
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_INDUCTOR_DISABLED"] = "1"

# Function to sanitize text by removing illegal characters
def sanitize_text(text):
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Function to load the model and tokenizer and move the model to GPU
def load_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.float16,  # Load in fp16 for efficiency
            device_map="auto"  # Automatically use GPU if available
        )

        # 🚀 Ensure model runs in eager mode (no compilation)
        if torch.cuda.is_available():
            model.to("cuda")  # Move model to GPU
        print("✅ Model loaded successfully in eager mode.")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model/tokenizer: {e}")
        return None, None

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ""  # Ensure text is not None
    except Exception as e:
        print(f"❌ Error extracting text from PDF {pdf_path}: {e}")
    return text.strip()

# Function to generate a summary
def generate_summary(input_text, model, tokenizer):
    try:
        prompt = (
            "Below is a legal document. Summarize its key points in a concise manner.\n\n"
            "### Document:\n{input_text}\n\n### Summary:"
        )

        # Ensure input text is within 8192 token limit
        input_str = prompt.format(input_text=input_text[:8192])

        model_inputs = tokenizer(
            input_str,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=8192  # Set explicit max length
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=1024,
            do_sample=True,  # Fix warning by enabling sampling
            top_k=50,
            top_p=0.95,
            temperature=0.5
        )

        summary = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"❌ Error generating summary: {e}")
        return ""

# Load the model and tokenizer
model_name = "coderop12/Empowering_Legal_Summarization"  # Replace with your preferred causal LM model
model, tokenizer = load_model_and_tokenizer(model_name)

# ✅ Verify the casefile path exists before processing
pdf_directory = "casefile"  # Your casefile directory

if not os.path.exists(pdf_directory):
    print(f"❌ Error: The specified directory '{pdf_directory}' does not exist.")
    exit(1)  # Stop execution if the folder is missing

if model is not None and tokenizer is not None:
    output_data = []

    # Process each PDF in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"📄 Processing: {filename}")
            document_text = extract_text_from_pdf(pdf_path)

            if document_text:
                sanitized_text = sanitize_text(document_text)
                summary = generate_summary(sanitized_text, model, tokenizer)
                sanitized_summary = sanitize_text(summary)
                output_data.append({"Filename": filename, "Summary": sanitized_summary})
            else:
                print(f"⚠️ Skipping {filename} - No extractable text found.")

    # Save results to Excel and CSV
    df = pd.DataFrame(output_data)
    output_excel_path = "output_summaries_1.xlsx"
    output_csv_path = "output_summaries_1.csv"

    try:
        df.to_excel(output_excel_path, index=False)
        print(f"✅ Summaries saved to Excel: {output_excel_path}")
    except Exception as e:
        print(f"⚠️ Error saving to Excel ({e}). Falling back to CSV...")
        df.to_csv(output_csv_path, index=False)
        print(f"✅ Summaries saved to CSV instead: {output_csv_path}")

    print("🎉 Processing complete!")
else:
    print("❌ Failed to load model/tokenizer. Exiting.")


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   3%|3         | 168M/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

✅ Model loaded successfully in eager mode.
📄 Processing: converted_text.pdf
⚠️ Error saving to Excel (No module named 'openpyxl'). Falling back to CSV...
✅ Summaries saved to CSV instead: output_summaries_1.csv
🎉 Processing complete!


In [None]:
#Inference run 2
import os
import PyPDF2
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Fully disable Torch Dynamo to prevent errors
import torch._dynamo
torch._dynamo.config.suppress_errors = True
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_INDUCTOR_DISABLED"] = "1"

# Function to sanitize text by removing illegal characters
def sanitize_text(text):
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Function to load the model and tokenizer and move the model to GPU
def load_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.float16,  # Load in fp16 for efficiency
            device_map="auto"  # Automatically use GPU if available
        )

        if torch.cuda.is_available():
            model.to("cuda")
        print("✅ Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model/tokenizer: {e}")
        return None, None

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ""  # Ensure text is not None
    except Exception as e:
        print(f"❌ Error extracting text from PDF {pdf_path}: {e}")
    return text.strip()

# Function to generate a summary (FIXED)
def generate_summary(input_text, model, tokenizer):
    try:
        prompt = (
            "Below is a legal document. Summarize its key points concisely.\n\n"
            "### Document:\n{input_text}\n\n### Summary:"
        )

        input_str = prompt.format(input_text=input_text[:4096])  # Reduce input size

        model_inputs = tokenizer(
            input_str,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=4096  # Reduce token count to prevent looping
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=256,  # Reduce summary length
            do_sample=True,  # Enable sampling
            top_k=40,  # Reduce likelihood of choosing most probable words
            top_p=0.8,  # More diverse output
            temperature=0.7,  # Increase randomness slightly
            repetition_penalty=1.5,  # Stronger penalty for repeated words
            no_repeat_ngram_size=3  # Prevent repeating 3-word sequences
        )

        summary = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        return summary.strip()
    except Exception as e:
        print(f"❌ Error generating summary: {e}")
        return ""

# Load the model and tokenizer
model_name = "google/gemma-2-2b-it"
model, tokenizer = load_model_and_tokenizer(model_name)

# ✅ Verify the casefile path exists before processing
pdf_directory = "/teamspace/studios/this_studio/casefile"

if not os.path.exists(pdf_directory):
    print(f"❌ Error: The specified directory '{pdf_directory}' does not exist.")
    exit(1)

if model is not None and tokenizer is not None:
    output_data = []

    # Process each PDF in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"📄 Processing: {filename}")
            document_text = extract_text_from_pdf(pdf_path)

            if document_text:
                sanitized_text = sanitize_text(document_text)
                summary = generate_summary(sanitized_text, model, tokenizer)
                sanitized_summary = sanitize_text(summary)
                output_data.append({"Filename": filename, "Summary": sanitized_summary})
            else:
                print(f"⚠️ Skipping {filename} - No extractable text found.")

    # Save results to Excel and CSV
    df = pd.DataFrame(output_data)
    output_excel_path = "output_summaries_2_finetuned.xlsx"
    output_csv_path = "output_summaries_1.csv"

    try:
        df.to_excel(output_excel_path, index=False)
        print(f"✅ Summaries saved to Excel: {output_excel_path}")
    except Exception as e:
        print(f"⚠️ Error saving to Excel ({e}). Falling back to CSV...")
        df.to_csv(output_csv_path, index=False)
        print(f"✅ Summaries saved to CSV instead: {output_csv_path}")

    print("🎉 Processing complete!")
else:
    print("❌ Failed to load model/tokenizer. Exiting.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded successfully.
📄 Processing: converted_text.pdf
✅ Summaries saved to Excel: output_summaries_2_finetuned.xlsx
🎉 Processing complete!


In [32]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [33]:
#inference run 3
import os
import PyPDF2
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Fully disable Torch Dynamo to prevent errors
import torch._dynamo
torch._dynamo.config.suppress_errors = True
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_INDUCTOR_DISABLED"] = "1"

# Function to sanitize text by removing illegal characters
def sanitize_text(text):
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Function to load the model and tokenizer and move the model to GPU
def load_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.float16,  # Load in fp16 for efficiency
            device_map="auto"  # Automatically use GPU if available
        )

        if torch.cuda.is_available():
            model.to("cuda")
        print("✅ Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model/tokenizer: {e}")
        return None, None

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ""  # Ensure text is not None
    except Exception as e:
        print(f"❌ Error extracting text from PDF {pdf_path}: {e}")
    return text.strip()

# Function to generate a summary (Updated)
def generate_summary(input_text, model, tokenizer):
    try:
        prompt = (
            "Below is a legal document. Summarize its key points concisely.\n\n"
            "### Document:\n{input_text}\n\n### Summary:"
        )

        # Limit input text to reduce length and avoid looping
        input_str = prompt.format(input_text=input_text[:4096])

        model_inputs = tokenizer(
            input_str,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=4096  # Set explicit max length
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=256,  # Limit summary length
            do_sample=True,      # Enable sampling
            top_k=40,
            top_p=0.8,
            temperature=0.7,
            repetition_penalty=1.5,
            no_repeat_ngram_size=3
        )

        full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        
        # Post-process to extract only the summary
        marker = "### Summary:"
        if marker in full_output:
            # Return text after the marker
            summary = full_output.split(marker, 1)[1].strip()
        else:
            summary = full_output.strip()
            
        return summary
    except Exception as e:
        print(f"❌ Error generating summary: {e}")
        return ""

# Load the model and tokenizer
model_name = "coderop12/Legal_Summarzation_System"
model, tokenizer = load_model_and_tokenizer(model_name)

# ✅ Verify the casefile path exists before processing
pdf_directory = "/teamspace/studios/this_studio/casefile"

if not os.path.exists(pdf_directory):
    print(f"❌ Error: The specified directory '{pdf_directory}' does not exist.")
    exit(1)

if model is not None and tokenizer is not None:
    output_data = []

    # Process each PDF in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"📄 Processing: {filename}")
            document_text = extract_text_from_pdf(pdf_path)

            if document_text:
                sanitized_text = sanitize_text(document_text)
                summary = generate_summary(sanitized_text, model, tokenizer)
                sanitized_summary = sanitize_text(summary)
                output_data.append({"Filename": filename, "Summary": sanitized_summary})
            else:
                print(f"⚠️ Skipping {filename} - No extractable text found.")

    # Save results to Excel and CSV
    df = pd.DataFrame(output_data)
    output_excel_path = "output_summaries_3_finetuned.xlsx"
    output_csv_path = "output_summaries_3_finetuned.csv"

    try:
        df.to_excel(output_excel_path, index=False)
        print(f"✅ Summaries saved to Excel: {output_excel_path}")
    except Exception as e:
        print(f"⚠️ Error saving to Excel ({e}). Falling back to CSV...")
        df.to_csv(output_csv_path, index=False)
        print(f"✅ Summaries saved to CSV instead: {output_csv_path}")

    print("🎉 Processing complete!")
else:
    print("❌ Failed to load model/tokenizer. Exiting.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded successfully.
📄 Processing: converted_text.pdf
✅ Summaries saved to Excel: output_summaries_3_finetuned.xlsx
🎉 Processing complete!


In [None]:
pip install rouge-score sentence-transformers


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting nltk (from rouge-score)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m153.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24986 sha256=d6718ef18288e4c6ff37f5d57f84348293a1027b95223fa5967ee9d91334c471
  Stored in directory: /home/zeus/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing col

In [None]:
from rouge import Rouge
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Gold standard summary (ChatGPT)
gold_summary = """The Court ruled that the respondents are entitled to seek a pre-deposit of only 20% of the disputed tax demand, subject to compliance with W.P. (C) 14536/2021. It highlighted that the Central Board of Direct Taxes (CBDT) has issued circulars and office memorandums, including those dated 29th February 2016 and 31st July 2017, outlining guidelines for granting a stay on tax demand recovery. These guidelines mandate that if an assessee appeals an assessment order before the first appellate authority (CIT(A)) and deposits 20% of the disputed demand, they should not be required to pay beyond this amount unless specific exceptions apply.

The Court found that the petitioner was not given a pre-decisional hearing before the tax refund was adjusted, entitling them to a refund of any excess adjustments beyond 20% of the disputed demand. The Assessing Officer has the authority to grant a stay on the recovery of the remaining balance. Additionally, in cases where a stay is granted, the excess amount can be refunded manually as per ITBA Assessment Instruction No. 11. The Court concluded that the respondents violated the provisions of the Office Memorandums, which mandate granting stays based on subsequent year refund adjustments. The government must adhere to its own rules and guidelines, failing which such actions could be invalidated."""

# Model-generated summary
model_summary = """This judgment outlines specific guidelines related to how taxpayers facing assessments disputes should handle those situations when seeking relief against added taxes levied within pending appeals process.  
Key takeaways include:* **Pre-deposit requirement:** Taxpayers must pay at least 2 out of every 10 taxed dollars owed if appealing income tax decisions but may request additional support based upon individual circumstances under certain conditions outlined below. This payment does not apply directly to future year's liabilities until after review proceedings have concluded.
**Additional Details & Supporting Context**: It appears there were issues surrounding whether these payments met criteria established earlier by government regulations concerning requests for staying debt collection while challenging assessed amounts via formal avenues like Appeals Hearings. In some instances, individuals might face automatic reduction in available funds even though ongoing litigation continues; however, it seems clear now that courts will ensure fairness across all parties involved throughout each stage - ensuring both taxpayer rights AND proper administration of revenue collections.**"""

# Compute ROUGE scores
rouge = Rouge()
rouge_scores = rouge.get_scores(model_summary, gold_summary, avg=True)

# Load SentenceTransformer for semantic similarity
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
gold_embedding = model.encode(gold_summary, convert_to_tensor=True)
model_embedding = model.encode(model_summary, convert_to_tensor=True)

# Compute cosine similarity
cosine_sim = util.pytorch_cos_sim(gold_embedding, model_embedding).item()

# Display results
{
    "ROUGE-1": rouge_scores["rouge-1"]["f"],
    "ROUGE-2": rouge_scores["rouge-2"]["f"],
    "ROUGE-L": rouge_scores["rouge-l"]["f"],
    "Cosine Similarity": cosine_sim
}


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{'ROUGE-1': 0.13468012970513232,
 'ROUGE-2': 0.0,
 'ROUGE-L': 0.10101009603509871,
 'Cosine Similarity': 0.6975638270378113}

In [None]:
from rouge import Rouge
from sentence_transformers import SentenceTransformer, util

# Updated system summary with fine-tuned and ChatGPT summaries
model_summary = """This judgment outlines specific guidelines related to how taxpayers facing assessments disputes should handle those situations when seeking relief against added taxes levied within pending appeals process.  
Key takeaways include:  
* **Pre-deposit requirement:** Taxpayers must pay at least 2 out of every 10 taxed dollars owed if appealing income tax decisions but may request additional support based upon individual circumstances under certain conditions outlined below. This payment does not apply directly to future year's liabilities until after review proceedings have concluded.  

**Additional Details & Supporting Context**:  
It appears there were issues surrounding whether these payments met criteria established earlier by government regulations concerning requests for staying debt collection while challenging assessed amounts via formal avenues like Appeals Hearings. In some instances, individuals might face automatic reduction in available funds even though ongoing litigation continues; however, it seems clear now that courts will ensure fairness across all parties involved throughout each stage - ensuring both taxpayer rights AND proper administration of revenue collections.  

**ChatGPT Summary (Court Ruling Summary):**  
The court ruled that tax authorities can seek only **20% of the disputed demand** as a pre-deposit during appeals, and any excess adjustments must be refunded. **Assessing Officers have the authority to grant a stay** on recovery. The government must adhere to **CBDT guidelines**, failing which actions may be invalidated.
"""

# Compute ROUGE scores
rouge = Rouge()
rouge_scores = rouge.get_scores(model_summary, gold_summary, avg=True)

# Load SentenceTransformer for semantic similarity
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
gold_embedding = model.encode(gold_summary, convert_to_tensor=True)
model_embedding = model.encode(model_summary, convert_to_tensor=True)

# Compute cosine similarity
cosine_sim = util.pytorch_cos_sim(gold_embedding, model_embedding).item()

# Display results
{
    "ROUGE-1": rouge_scores["rouge-1"]["f"],
    "ROUGE-2": rouge_scores["rouge-2"]["f"],
    "ROUGE-L": rouge_scores["rouge-l"]["f"],
    "Cosine Similarity": cosine_sim
}


{'ROUGE-1': 0.26548672083640074,
 'ROUGE-2': 0.08252426685891726,
 'ROUGE-L': 0.2182890807184067,
 'Cosine Similarity': 0.6975638270378113}

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
!pip install openpyxl



In [None]:
from huggingface_hub import delete_repo, login

# 1. Log in with your Hugging Face token

# 2. Delete the repository
repo_id = "coderop12/Legal_Summarzation_System"  # "username/repo_name"
delete_repo(repo_id=repo_id, repo_type="model")

print(f"Repo '{repo_id}' has been deleted.")


Repo 'coderop12/Legal_Summarzation_System' has been deleted.


In [2]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.38-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
Downloading alembic-1.14.1-py3-none-any.whl (233 kB)
Downloading SQLAlchemy-2.0.38-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m180.4 MB/s

In [None]:
import os
import PyPDF2
import pandas as pd
import torch
import re
import difflib
import optuna
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Fully disable Torch Dynamo to prevent errors
import torch._dynamo
torch._dynamo.config.suppress_errors = True
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_INDUCTOR_DISABLED"] = "1"

# Function to sanitize text by removing illegal characters
def sanitize_text(text):
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Function to load the model and tokenizer and move the model to GPU
def load_model_and_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.float16,  # Load in fp16 for efficiency
            device_map="auto"  # Automatically use GPU if available
        )

        if torch.cuda.is_available():
            model.to("cuda")
        print("✅ Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model/tokenizer: {e}")
        return None, None

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ""  # Ensure text is not None
    except Exception as e:
        print(f"❌ Error extracting text from PDF {pdf_path}: {e}")
    return text.strip()

# Function to generate summary with tunable parameters
def generate_summary_custom(input_text, model, tokenizer,
                            top_k, top_p, temperature, repetition_penalty, no_repeat_ngram_size):
    try:
        prompt = (
            "Below is a legal document. Summarize its key points concisely.\n\n"
            "### Document:\n{input_text}\n\n### Summary:"
        )
        # Limit input text to reduce length and avoid looping
        input_str = prompt.format(input_text=input_text[:4096])
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model_inputs = tokenizer(
            input_str,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=4096
        ).to(device)

        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=256,  # Limit summary length
            do_sample=True,      # Enable sampling
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=no_repeat_ngram_size
        )

        full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        # Post-process to extract only the summary
        marker = "### Summary:"
        if marker in full_output:
            summary = full_output.split(marker, 1)[1].strip()
        else:
            summary = full_output.strip()
        return summary
    except Exception as e:
        print(f"❌ Error generating summary: {e}")
        return ""

# For production, use default generation parameters
def generate_summary(input_text, model, tokenizer):
    return generate_summary_custom(
        input_text, model, tokenizer,
        top_k=40,
        top_p=0.8,
        temperature=0.7,
        repetition_penalty=1.5,
        no_repeat_ngram_size=3
    )

# Load the model and tokenizer
model_name = "coderop12/Legal_Summarzation_System"
model, tokenizer = load_model_and_tokenizer(model_name)

# ✅ Verify the casefile path exists before processing
pdf_directory = "/teamspace/studios/this_studio/casefile"

if not os.path.exists(pdf_directory):
    print(f"❌ Error: The specified directory '{pdf_directory}' does not exist.")
    exit(1)

# =======================
# Hyperparameter Tuning
# =======================

# For hyperparameter tuning, we assume you have at least one sample document and a reference summary.
# Replace these with your actual validation examples.
sample_document = "Your sample legal document text goes here. This should be representative of your documents."
reference_summary = "Your expected concise summary for the sample document."

def objective(trial):
    # Define the hyperparameter search space
    top_k = trial.suggest_int("top_k", 20, 100)
    top_p = trial.suggest_uniform("top_p", 0.6, 0.95)
    temperature = trial.suggest_loguniform("temperature", 0.5, 1.5)
    repetition_penalty = trial.suggest_uniform("repetition_penalty", 1.0, 2.0)
    no_repeat_ngram_size = trial.suggest_int("no_repeat_ngram_size", 2, 5)

    # Generate summary using the custom function
    summary = generate_summary_custom(
        sample_document, model, tokenizer,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size
    )
    
    # Calculate a simple similarity score using difflib
    # (Replace with a more robust metric for real evaluation, e.g., ROUGE)
    score = difflib.SequenceMatcher(None, summary, reference_summary).ratio()
    print(f"Trial {trial.number}: score={score:.4f} with params: top_k={top_k}, top_p={top_p:.2f}, temperature={temperature:.2f}, repetition_penalty={repetition_penalty:.2f}, no_repeat_ngram_size={no_repeat_ngram_size}")
    return score

# Run hyperparameter tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print("Best hyperparameters found:", study.best_trial.params)

# Optionally, you can update your production generation parameters based on the best trial.
best_params = study.best_trial.params

# =======================
# Inference / Processing PDFs
# =======================

if model is not None and tokenizer is not None:
    output_data = []

    # Process each PDF in the directory
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"📄 Processing: {filename}")
            document_text = extract_text_from_pdf(pdf_path)

            if document_text:
                sanitized_text = sanitize_text(document_text)
                # Use best parameters found from tuning
                summary = generate_summary_custom(
                    sanitized_text, model, tokenizer,
                    top_k=best_params.get("top_k", 40),
                    top_p=best_params.get("top_p", 0.8),
                    temperature=best_params.get("temperature", 0.7),
                    repetition_penalty=best_params.get("repetition_penalty", 1.5),
                    no_repeat_ngram_size=best_params.get("no_repeat_ngram_size", 3)
                )
                sanitized_summary = sanitize_text(summary)
                output_data.append({"Filename": filename, "Summary": sanitized_summary})
            else:
                print(f"⚠️ Skipping {filename} - No extractable text found.")

    # Save results to Excel and CSV
    df = pd.DataFrame(output_data)
    output_excel_path = "output_summaries_3_finetuned.xlsx"
    output_csv_path = "output_summaries_3_finetuned.csv"

    try:
        df.to_excel(output_excel_path, index=False)
        print(f"✅ Summaries saved to Excel: {output_excel_path}")
    except Exception as e:
        print(f"⚠️ Error saving to Excel ({e}). Falling back to CSV...")
        df.to_csv(output_csv_path, index=False)
        print(f"✅ Summaries saved to CSV instead: {output_csv_path}")

    print("🎉 Processing complete!")
else:
    print("❌ Failed to load model/tokenizer. Exiting.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-02-28 06:26:54,475] A new study created in memory with name: no-name-b8f6e9f7-6c3c-4c3e-999c-583bed827d41


✅ Model loaded successfully.


  top_p = trial.suggest_uniform("top_p", 0.6, 0.95)
  temperature = trial.suggest_loguniform("temperature", 0.5, 1.5)
  repetition_penalty = trial.suggest_uniform("repetition_penalty", 1.0, 2.0)
[I 2025-02-28 06:27:04,289] Trial 0 finished with value: 0.026871401151631478 and parameters: {'top_k': 46, 'top_p': 0.8520246039900785, 'temperature': 1.350824527826988, 'repetition_penalty': 1.3364138564986499, 'no_repeat_ngram_size': 5}. Best is trial 0 with value: 0.026871401151631478.


Trial 0: score=0.0269 with params: top_k=46, top_p=0.85, temperature=1.35, repetition_penalty=1.34, no_repeat_ngram_size=5


[I 2025-02-28 06:27:11,817] Trial 1 finished with value: 0.054673721340388004 and parameters: {'top_k': 59, 'top_p': 0.8161857308415474, 'temperature': 0.9042019985067397, 'repetition_penalty': 1.6515748797760212, 'no_repeat_ngram_size': 5}. Best is trial 1 with value: 0.054673721340388004.


Trial 1: score=0.0547 with params: top_k=59, top_p=0.82, temperature=0.90, repetition_penalty=1.65, no_repeat_ngram_size=5


[I 2025-02-28 06:27:20,815] Trial 2 finished with value: 0.05259515570934256 and parameters: {'top_k': 52, 'top_p': 0.6724843763401622, 'temperature': 1.4407423713154277, 'repetition_penalty': 1.4313346041230484, 'no_repeat_ngram_size': 5}. Best is trial 1 with value: 0.054673721340388004.


Trial 2: score=0.0526 with params: top_k=52, top_p=0.67, temperature=1.44, repetition_penalty=1.43, no_repeat_ngram_size=5


[I 2025-02-28 06:27:24,685] Trial 3 finished with value: 0.10950080515297907 and parameters: {'top_k': 54, 'top_p': 0.9304373287991912, 'temperature': 1.119390927509042, 'repetition_penalty': 1.4365684393978444, 'no_repeat_ngram_size': 2}. Best is trial 3 with value: 0.10950080515297907.


Trial 3: score=0.1095 with params: top_k=54, top_p=0.93, temperature=1.12, repetition_penalty=1.44, no_repeat_ngram_size=2


[I 2025-02-28 06:27:33,657] Trial 4 finished with value: 0.05166051660516605 and parameters: {'top_k': 33, 'top_p': 0.8925898504371182, 'temperature': 0.6944585918616094, 'repetition_penalty': 1.7380656624655435, 'no_repeat_ngram_size': 4}. Best is trial 3 with value: 0.10950080515297907.


Trial 4: score=0.0517 with params: top_k=33, top_p=0.89, temperature=0.69, repetition_penalty=1.74, no_repeat_ngram_size=4


[I 2025-02-28 06:27:34,877] Trial 5 finished with value: 0.18840579710144928 and parameters: {'top_k': 43, 'top_p': 0.7673380052064728, 'temperature': 0.6111679958539, 'repetition_penalty': 1.5219379518127714, 'no_repeat_ngram_size': 5}. Best is trial 5 with value: 0.18840579710144928.


Trial 5: score=0.1884 with params: top_k=43, top_p=0.77, temperature=0.61, repetition_penalty=1.52, no_repeat_ngram_size=5


[I 2025-02-28 06:27:43,925] Trial 6 finished with value: 0.06438068579426172 and parameters: {'top_k': 47, 'top_p': 0.9094564846480873, 'temperature': 0.9857312638025773, 'repetition_penalty': 1.9997304652653645, 'no_repeat_ngram_size': 3}. Best is trial 5 with value: 0.18840579710144928.


Trial 6: score=0.0644 with params: top_k=47, top_p=0.91, temperature=0.99, repetition_penalty=2.00, no_repeat_ngram_size=3


[I 2025-02-28 06:27:44,664] Trial 7 finished with value: 0.15757575757575756 and parameters: {'top_k': 44, 'top_p': 0.6629415771431802, 'temperature': 0.6162886842856679, 'repetition_penalty': 1.945508888631821, 'no_repeat_ngram_size': 2}. Best is trial 5 with value: 0.18840579710144928.


Trial 7: score=0.1576 with params: top_k=44, top_p=0.66, temperature=0.62, repetition_penalty=1.95, no_repeat_ngram_size=2


[I 2025-02-28 06:27:53,699] Trial 8 finished with value: 0.026817219477769938 and parameters: {'top_k': 95, 'top_p': 0.9092888192864168, 'temperature': 0.8070390291298929, 'repetition_penalty': 1.6664228113166155, 'no_repeat_ngram_size': 5}. Best is trial 5 with value: 0.18840579710144928.


Trial 8: score=0.0268 with params: top_k=95, top_p=0.91, temperature=0.81, repetition_penalty=1.67, no_repeat_ngram_size=5


[I 2025-02-28 06:27:55,491] Trial 9 finished with value: 0.2059800664451827 and parameters: {'top_k': 97, 'top_p': 0.627233300728278, 'temperature': 0.722404570283385, 'repetition_penalty': 1.2254330744835047, 'no_repeat_ngram_size': 4}. Best is trial 9 with value: 0.2059800664451827.


Trial 9: score=0.2060 with params: top_k=97, top_p=0.63, temperature=0.72, repetition_penalty=1.23, no_repeat_ngram_size=4


[I 2025-02-28 06:27:57,577] Trial 10 finished with value: 0.23100303951367782 and parameters: {'top_k': 99, 'top_p': 0.6193927634347278, 'temperature': 0.5279946270879468, 'repetition_penalty': 1.0449715770159638, 'no_repeat_ngram_size': 3}. Best is trial 10 with value: 0.23100303951367782.


Trial 10: score=0.2310 with params: top_k=99, top_p=0.62, temperature=0.53, repetition_penalty=1.04, no_repeat_ngram_size=3


[I 2025-02-28 06:27:58,982] Trial 11 finished with value: 0.2938775510204082 and parameters: {'top_k': 99, 'top_p': 0.6051211262385418, 'temperature': 0.5171226845603978, 'repetition_penalty': 1.1163889069525519, 'no_repeat_ngram_size': 3}. Best is trial 11 with value: 0.2938775510204082.


Trial 11: score=0.2939 with params: top_k=99, top_p=0.61, temperature=0.52, repetition_penalty=1.12, no_repeat_ngram_size=3


[I 2025-02-28 06:28:04,110] Trial 12 finished with value: 0.03337969401947149 and parameters: {'top_k': 79, 'top_p': 0.6021791361437088, 'temperature': 0.5256420204394749, 'repetition_penalty': 1.0399139991749244, 'no_repeat_ngram_size': 3}. Best is trial 11 with value: 0.2938775510204082.


Trial 12: score=0.0334 with params: top_k=79, top_p=0.60, temperature=0.53, repetition_penalty=1.04, no_repeat_ngram_size=3


[I 2025-02-28 06:28:13,279] Trial 13 finished with value: 0.0610079575596817 and parameters: {'top_k': 79, 'top_p': 0.6998162176886097, 'temperature': 0.5034830038538135, 'repetition_penalty': 1.068273470658218, 'no_repeat_ngram_size': 3}. Best is trial 11 with value: 0.2938775510204082.


Trial 13: score=0.0610 with params: top_k=79, top_p=0.70, temperature=0.50, repetition_penalty=1.07, no_repeat_ngram_size=3


[I 2025-02-28 06:28:22,594] Trial 14 finished with value: 0.03838517538054269 and parameters: {'top_k': 82, 'top_p': 0.7438720330425465, 'temperature': 0.6030557222751808, 'repetition_penalty': 1.1986732273287557, 'no_repeat_ngram_size': 3}. Best is trial 11 with value: 0.2938775510204082.


Trial 14: score=0.0384 with params: top_k=82, top_p=0.74, temperature=0.60, repetition_penalty=1.20, no_repeat_ngram_size=3


[I 2025-02-28 06:28:24,048] Trial 15 finished with value: 0.2302158273381295 and parameters: {'top_k': 70, 'top_p': 0.7264428361229209, 'temperature': 0.5544254152435644, 'repetition_penalty': 1.1781399602991156, 'no_repeat_ngram_size': 2}. Best is trial 11 with value: 0.2938775510204082.


Trial 15: score=0.2302 with params: top_k=70, top_p=0.73, temperature=0.55, repetition_penalty=1.18, no_repeat_ngram_size=2


[I 2025-02-28 06:28:33,049] Trial 16 finished with value: 0.05714285714285714 and parameters: {'top_k': 89, 'top_p': 0.6387680280125874, 'temperature': 0.7107545920284921, 'repetition_penalty': 1.007995645379109, 'no_repeat_ngram_size': 4}. Best is trial 11 with value: 0.2938775510204082.


Trial 16: score=0.0571 with params: top_k=89, top_p=0.64, temperature=0.71, repetition_penalty=1.01, no_repeat_ngram_size=4


[I 2025-02-28 06:28:34,778] Trial 17 finished with value: 0.24918032786885247 and parameters: {'top_k': 100, 'top_p': 0.6057870800150593, 'temperature': 0.5703089036934398, 'repetition_penalty': 1.3048363195513406, 'no_repeat_ngram_size': 3}. Best is trial 11 with value: 0.2938775510204082.


Trial 17: score=0.2492 with params: top_k=100, top_p=0.61, temperature=0.57, repetition_penalty=1.30, no_repeat_ngram_size=3


[I 2025-02-28 06:28:36,940] Trial 18 finished with value: 0.18848167539267016 and parameters: {'top_k': 21, 'top_p': 0.686913519884336, 'temperature': 0.78869188121657, 'repetition_penalty': 1.3342435921771238, 'no_repeat_ngram_size': 4}. Best is trial 11 with value: 0.2938775510204082.


Trial 18: score=0.1885 with params: top_k=21, top_p=0.69, temperature=0.79, repetition_penalty=1.33, no_repeat_ngram_size=4


[I 2025-02-28 06:28:45,933] Trial 19 finished with value: 0.06597222222222222 and parameters: {'top_k': 69, 'top_p': 0.8032778940009122, 'temperature': 0.6611777023405808, 'repetition_penalty': 1.2966563033930394, 'no_repeat_ngram_size': 2}. Best is trial 11 with value: 0.2938775510204082.


Trial 19: score=0.0660 with params: top_k=69, top_p=0.80, temperature=0.66, repetition_penalty=1.30, no_repeat_ngram_size=2
Best hyperparameters found: {'top_k': 99, 'top_p': 0.6051211262385418, 'temperature': 0.5171226845603978, 'repetition_penalty': 1.1163889069525519, 'no_repeat_ngram_size': 3}
📄 Processing: converted_text.pdf
✅ Summaries saved to Excel: output_summaries_3_finetuned.xlsx
🎉 Processing complete!


In [9]:
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Replace these with your actual summaries.
generated_summary = (
    "This judgment deals with the issue of whether the Assessing Officers have the power under section 263 of the "
    "Income Tax Act, 1961 to adjust the refund of the previous year against the current year’s liability. The High Court "
    "of Delhi observed that it is not permissible to make any adjustment of the refund without giving any opportunity of "
    "hearing to the taxpayer. It also observed that if the taxpayer makes a payment towards the disputed amount, then he "
    "will be entitled to get his refund adjusted accordingly. The Tribunal relied upon the judgments of the Supreme Court "
    "in the matter of M/S. Reliance Industries Ltd. v. Union of India & Ors. [W.P.(C) No. 10719 of 18] and others wherein "
    "the Hon’ble Supreme Court held, “It is imperative that the Department should not proceed with the recovery proceedings "
    "without affording an opportunity of being heard to the assesse.” The Hon’able High Court observed: “The facts of the "
    "present case show that the Assessing officer did not give any opportunity to the Petitioner to explain his position and "
    "therefore, the impug"  # Note: Generated summary appears truncated.
)

# Provide your complete reference summary here.
reference_summary = (
    "This judgment deals with the issue of whether the Assessing Officers have the power under section 263 of the Income Tax "
    "Act, 1961 to adjust the refund of the previous year against the current year’s liability. The High Court of Delhi observed "
    "that it is not permissible to make any adjustment of the refund without giving any opportunity of hearing to the taxpayer. "
    "It also observed that if the taxpayer makes a payment towards the disputed amount, then he will be entitled to get his refund "
    "adjusted accordingly. The Tribunal relied upon the judgments of the Supreme Court in the matter of M/S. Reliance Industries Ltd. "
    "v. Union of India & Ors. [W.P.(C) No. 10719 of 18] and others, wherein the Hon’ble Supreme Court held, “It is imperative that the "
    "Department should not proceed with the recovery proceedings without affording an opportunity of being heard to the assesse.” "
    "The Hon’able High Court further observed that the petitioner's right to a hearing was violated, and accordingly, the orders "
    "passed were invalid."
)

def compare_summaries(generated, reference):
    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(reference, generated)
    
    # Compute cosine similarity between summaries using SentenceTransformer
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = embedder.encode([generated, reference])
    cosine_similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    
    return rouge_scores, cosine_similarity

# Evaluate the summaries.
scores, cosine_sim = compare_summaries(generated_summary, reference_summary)

print("ROUGE Scores:")
for metric, score in scores.items():
    print(f"  {metric}: F1 = {score.fmeasure:.4f}, Precision = {score.precision:.4f}, Recall = {score.recall:.4f}")

print(f"\nCosine Similarity: {cosine_sim:.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ROUGE Scores:
  rouge1: F1 = 0.9086, Precision = 0.8865, Recall = 0.9318
  rouge2: F1 = 0.8802, Precision = 0.8587, Recall = 0.9029
  rougeL: F1 = 0.9086, Precision = 0.8865, Recall = 0.9318

Cosine Similarity: 0.9847


In [10]:
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

# Base summary (as provided)
base_summary = (
    "The court held that respondents are entitled only to a pre-deposit of 20% of the disputed tax demand, "
    "provided that the petitioner receives an opportunity for a pre-decisional hearing. Citing various circulars "
    "and office memorandums issued between February 2016 and July 2017, the court emphasized that in cases where "
    "an assessee challenges disallowances without a proper hearing, any refund adjustments beyond 20% must be returned "
    "until the first appeal is resolved. Additionally, assessing officers have the authority to grant a stay on the recovery "
    "of the remaining demand and process manual refunds in line with ITBA assessment instructions. The court concluded "
    "that failure to adhere to these established procedures would render the actions of the respondents invalid."
)

# "My summary" provided (note: it is truncated)
my_summary = (
    "This judgment deals with the issue of whether the Assessing Officers have the power under section 263 of the Income Tax Act, "
    "1961 to adjust the refund of the previous year against the current year’s liability. The High Court of Delhi observed that it "
    "is not permissible to make any adjustment of the refund without giving any opportunity of hearing to the taxpayer. It also observed "
    "that if the taxpayer makes a payment towards the disputed amount, then he will be entitled to get his refund adjusted accordingly. "
    "The Tribunal relied upon the judgments of the Supreme Court in the matter of M/S. Reliance Industries Ltd. v. Union of India & Ors. "
    "[W.P.(C) No. 10719 of 18] and others wherein the Hon’ble Supreme Court held, “It is imperative that the Department should not proceed "
    "with the recovery proceedings without affording an opportunity of being heard to the assesse.” The Hon’able High Court observed: “The "
    "facts of the present case show that the Assessing officer did not give any opportunity to the Petitioner to explain his position and therefore, "
    "the impug"
)

def compare_summaries(summary1, summary2):
    # Calculate ROUGE scores (using ROUGE-1, ROUGE-2, and ROUGE-L)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(summary1, summary2)
    
    # Calculate cosine similarity using a pre-trained SentenceTransformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([summary1, summary2])
    cosine_similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    
    return rouge_scores, cosine_similarity

# Compare the base summary with "my summary"
scores, cosine_sim = compare_summaries(base_summary, my_summary)

# Print the evaluation results
print("ROUGE Scores:")
for metric, score in scores.items():
    print(f"  {metric}: F1 = {score.fmeasure:.4f}, Precision = {score.precision:.4f}, Recall = {score.recall:.4f}")

print(f"\nCosine Similarity: {cosine_sim:.4f}")


ROUGE Scores:
  rouge1: F1 = 0.3553, Precision = 0.2919, Recall = 0.4538
  rouge2: F1 = 0.0861, Precision = 0.0707, Recall = 0.1102
  rougeL: F1 = 0.1842, Precision = 0.1514, Recall = 0.2353

Cosine Similarity: 0.6897


In [8]:
!pip install sentence_transformers
!pip install rouge_score

Collecting sentence_transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.4.1


In [1]:
import os
# Disable TorchDynamo completely before importing torch
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_INDUCTOR_DISABLED"] = "1"

import torch
import torch._dynamo
# Also suppress errors (as a fallback)
torch._dynamo.config.suppress_errors = True

import re
import optuna
from transformers import AutoTokenizer, AutoModelForCausalLM
from rouge_score import rouge_scorer

# -------------------------------
# Load model and tokenizer
# -------------------------------
model_name = "coderop12/Empowering_Legal_Summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
if torch.cuda.is_available():
    model.to("cuda")
print("✅ Model and tokenizer loaded successfully.")

# -------------------------------
# Define generation function (runs in eager mode)
# -------------------------------
def generate_summary_custom(input_text, model, tokenizer,
                            top_k, top_p, temperature, repetition_penalty, no_repeat_ngram_size, max_new_tokens=256):
    prompt = (
        "Below is a legal document. Summarize its key points concisely.\n\n"
        "### Document:\n{input_text}\n\n### Summary:"
    )
    # Limit input text length to avoid exceeding model limits.
    input_str = prompt.format(input_text=input_text[:4096])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_inputs = tokenizer(
        input_str,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=4096
    ).to(device)
    
    summary_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size
    )
    
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    marker = "### Summary:"
    if marker in summary:
        summary = summary.split(marker, 1)[1].strip()
    return summary

# -------------------------------
# Define evaluation function using ROUGE-1 F1 and a length penalty
# -------------------------------
def evaluate_summary(generated, reference, target_length=100):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    rouge1_f1 = scores['rouge1'].fmeasure
    
    # Apply a penalty if the generated summary exceeds target_length words
    generated_length = len(re.findall(r'\w+', generated))
    penalty = 0.0
    if generated_length > target_length:
        penalty = 0.01 * (generated_length - target_length)
    
    return rouge1_f1 - penalty

# -------------------------------
# Sample document and reference summary for tuning
# -------------------------------
sample_document = (
    "The court observed that the respondents are entitled to seek pre deposit of only 20% of the disputed demand subject to "
    "fulfillment of W.P. (C) 14536 2021. The court observed, “The Court observed that in order to provide guidance and lay down "
    "principles regarding stay of demand, the Central Board of Direct Taxes has issued various Circulars and Office Memorandums "
    "dated 29th February 2016 and 31st July 2017 prescribing that in cases where an assessee challenges additions and disallowances..."
)

reference_summary = (
    "The court held that respondents are entitled only to a pre-deposit of 20% of the disputed tax demand, provided that the petitioner receives "
    "an opportunity for a pre-decisional hearing. Citing various circulars and office memorandums issued between February 2016 and July 2017, "
    "the court emphasized that any refund adjustments beyond 20% must be returned until the first appeal is resolved. Assessing officers may "
    "grant a stay on the recovery of the remaining demand and process manual refunds in line with ITBA instructions."
)

# -------------------------------
# Define the objective function for hyperparameter tuning
# -------------------------------
def objective(trial):
    # Sample hyperparameters
    top_k = trial.suggest_int("top_k", 20, 100)
    top_p = trial.suggest_float("top_p", 0.6, 0.95)
    temperature = trial.suggest_float("temperature", 0.5, 1.5, log=True)
    repetition_penalty = trial.suggest_float("repetition_penalty", 1.0, 2.0)
    no_repeat_ngram_size = trial.suggest_int("no_repeat_ngram_size", 2, 5)
    max_new_tokens = trial.suggest_int("max_new_tokens", 128, 256)
    
    # Generate the summary
    summary = generate_summary_custom(
        sample_document, model, tokenizer,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        max_new_tokens=max_new_tokens
    )
    
    # Evaluate the summary using ROUGE-1 F1 with a length penalty
    score = evaluate_summary(summary, reference_summary, target_length=100)
    
    print(f"Trial {trial.number}: score={score:.4f}, length={len(summary.split())}, "
          f"params: top_k={top_k}, top_p={top_p:.2f}, temp={temperature:.2f}, rep_pen={repetition_penalty:.2f}, "
          f"no_rep_ngram={no_repeat_ngram_size}, max_tokens={max_new_tokens}")
    
    return score

# -------------------------------
# Run hyperparameter tuning with Optuna
# -------------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Best hyperparameters found:", study.best_trial.params)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-02-28 07:12:04,221] A new study created in memory with name: no-name-acfff696-d057-421d-a832-cb750e94fe31


✅ Model and tokenizer loaded successfully.


[I 2025-02-28 07:12:13,379] Trial 0 finished with value: -0.7236059479553902 and parameters: {'top_k': 74, 'top_p': 0.8224207787265655, 'temperature': 0.6904628500724943, 'repetition_penalty': 1.7702606234337228, 'no_repeat_ngram_size': 3, 'max_new_tokens': 248}. Best is trial 0 with value: -0.7236059479553902.


Trial 0: score=-0.7236, length=183, params: top_k=74, top_p=0.82, temp=0.69, rep_pen=1.77, no_rep_ngram=3, max_tokens=248


[I 2025-02-28 07:12:19,264] Trial 1 finished with value: 0.008118811881188126 and parameters: {'top_k': 43, 'top_p': 0.8961012983024321, 'temperature': 0.561663586916444, 'repetition_penalty': 1.804872599464883, 'no_repeat_ngram_size': 2, 'max_new_tokens': 168}. Best is trial 1 with value: 0.008118811881188126.


Trial 1: score=0.0081, length=105, params: top_k=43, top_p=0.90, temp=0.56, rep_pen=1.80, no_rep_ngram=2, max_tokens=168


[I 2025-02-28 07:12:26,373] Trial 2 finished with value: -0.11834123222748819 and parameters: {'top_k': 36, 'top_p': 0.7284952385214016, 'temperature': 0.821537937879596, 'repetition_penalty': 1.9767877548698336, 'no_repeat_ngram_size': 2, 'max_new_tokens': 198}. Best is trial 1 with value: 0.008118811881188126.


Trial 2: score=-0.1183, length=124, params: top_k=36, top_p=0.73, temp=0.82, rep_pen=1.98, no_rep_ngram=2, max_tokens=198


[I 2025-02-28 07:12:32,039] Trial 3 finished with value: -0.3033333333333334 and parameters: {'top_k': 67, 'top_p': 0.6203791803811505, 'temperature': 0.5983036352833041, 'repetition_penalty': 1.9147372282093063, 'no_repeat_ngram_size': 3, 'max_new_tokens': 161}. Best is trial 1 with value: 0.008118811881188126.


Trial 3: score=-0.3033, length=137, params: top_k=67, top_p=0.62, temp=0.60, rep_pen=1.91, no_rep_ngram=3, max_tokens=161


[I 2025-02-28 07:12:37,223] Trial 4 finished with value: -0.032364532019704434 and parameters: {'top_k': 26, 'top_p': 0.7670057081857935, 'temperature': 0.9037083247385638, 'repetition_penalty': 1.9427089014327055, 'no_repeat_ngram_size': 5, 'max_new_tokens': 145}. Best is trial 1 with value: 0.008118811881188126.


Trial 4: score=-0.0324, length=112, params: top_k=26, top_p=0.77, temp=0.90, rep_pen=1.94, no_rep_ngram=5, max_tokens=145


[I 2025-02-28 07:12:43,786] Trial 5 finished with value: -0.24821428571428575 and parameters: {'top_k': 41, 'top_p': 0.8336285864446431, 'temperature': 0.6794788985132385, 'repetition_penalty': 1.6720131401087253, 'no_repeat_ngram_size': 4, 'max_new_tokens': 184}. Best is trial 1 with value: 0.008118811881188126.


Trial 5: score=-0.2482, length=140, params: top_k=41, top_p=0.83, temp=0.68, rep_pen=1.67, no_rep_ngram=4, max_tokens=184


[I 2025-02-28 07:12:52,346] Trial 6 finished with value: -0.5927131782945736 and parameters: {'top_k': 68, 'top_p': 0.9251203638692991, 'temperature': 1.449199550026813, 'repetition_penalty': 1.3585231524984138, 'no_repeat_ngram_size': 3, 'max_new_tokens': 241}. Best is trial 1 with value: 0.008118811881188126.


Trial 6: score=-0.5927, length=170, params: top_k=68, top_p=0.93, temp=1.45, rep_pen=1.36, no_rep_ngram=3, max_tokens=241


[I 2025-02-28 07:13:00,783] Trial 7 finished with value: -0.7108823529411765 and parameters: {'top_k': 73, 'top_p': 0.7931652334514427, 'temperature': 0.9764662956751525, 'repetition_penalty': 1.8275762863954075, 'no_repeat_ngram_size': 4, 'max_new_tokens': 235}. Best is trial 1 with value: 0.008118811881188126.


Trial 7: score=-0.7109, length=181, params: top_k=73, top_p=0.79, temp=0.98, rep_pen=1.83, no_rep_ngram=4, max_tokens=235


[I 2025-02-28 07:13:07,512] Trial 8 finished with value: -0.34434599156118145 and parameters: {'top_k': 47, 'top_p': 0.7128919522462567, 'temperature': 1.1646722717799765, 'repetition_penalty': 1.706737014090148, 'no_repeat_ngram_size': 4, 'max_new_tokens': 191}. Best is trial 1 with value: 0.008118811881188126.


Trial 8: score=-0.3443, length=151, params: top_k=47, top_p=0.71, temp=1.16, rep_pen=1.71, no_rep_ngram=4, max_tokens=191


[I 2025-02-28 07:13:14,903] Trial 9 finished with value: -0.4787096774193549 and parameters: {'top_k': 46, 'top_p': 0.6152062245287181, 'temperature': 0.6396051647990693, 'repetition_penalty': 1.6333604522847631, 'no_repeat_ngram_size': 2, 'max_new_tokens': 202}. Best is trial 1 with value: 0.008118811881188126.


Trial 9: score=-0.4787, length=156, params: top_k=46, top_p=0.62, temp=0.64, rep_pen=1.63, no_rep_ngram=2, max_tokens=202


[I 2025-02-28 07:13:19,785] Trial 10 finished with value: 0.27932960893854747 and parameters: {'top_k': 96, 'top_p': 0.9491087361216234, 'temperature': 0.5554768371287366, 'repetition_penalty': 1.019349081907398, 'no_repeat_ngram_size': 2, 'max_new_tokens': 137}. Best is trial 10 with value: 0.27932960893854747.


Trial 10: score=0.2793, length=86, params: top_k=96, top_p=0.95, temp=0.56, rep_pen=1.02, no_rep_ngram=2, max_tokens=137


[I 2025-02-28 07:13:24,372] Trial 11 finished with value: 0.16149068322981366 and parameters: {'top_k': 99, 'top_p': 0.9458934118113992, 'temperature': 0.5342946308193555, 'repetition_penalty': 1.0880119412761964, 'no_repeat_ngram_size': 2, 'max_new_tokens': 128}. Best is trial 10 with value: 0.27932960893854747.


Trial 11: score=0.1615, length=74, params: top_k=99, top_p=0.95, temp=0.53, rep_pen=1.09, no_rep_ngram=2, max_tokens=128


[I 2025-02-28 07:13:29,008] Trial 12 finished with value: 0.358695652173913 and parameters: {'top_k': 98, 'top_p': 0.9425853226859308, 'temperature': 0.5176776992651415, 'repetition_penalty': 1.0180368146560737, 'no_repeat_ngram_size': 2, 'max_new_tokens': 129}. Best is trial 12 with value: 0.358695652173913.


Trial 12: score=0.3587, length=96, params: top_k=98, top_p=0.94, temp=0.52, rep_pen=1.02, no_rep_ngram=2, max_tokens=129


[I 2025-02-28 07:13:33,630] Trial 13 finished with value: 0.30000000000000004 and parameters: {'top_k': 99, 'top_p': 0.8793759723347118, 'temperature': 0.509327497528851, 'repetition_penalty': 1.018014647205958, 'no_repeat_ngram_size': 2, 'max_new_tokens': 129}. Best is trial 12 with value: 0.358695652173913.


Trial 13: score=0.3000, length=93, params: top_k=99, top_p=0.88, temp=0.51, rep_pen=1.02, no_rep_ngram=2, max_tokens=129


[I 2025-02-28 07:13:39,426] Trial 14 finished with value: 0.19889502762430936 and parameters: {'top_k': 84, 'top_p': 0.876239818347908, 'temperature': 0.7595142482842471, 'repetition_penalty': 1.2180688122281913, 'no_repeat_ngram_size': 3, 'max_new_tokens': 155}. Best is trial 12 with value: 0.358695652173913.


Trial 14: score=0.1989, length=84, params: top_k=84, top_p=0.88, temp=0.76, rep_pen=1.22, no_rep_ngram=3, max_tokens=155


[I 2025-02-28 07:13:44,059] Trial 15 finished with value: 0.24719101123595508 and parameters: {'top_k': 86, 'top_p': 0.8829833959782672, 'temperature': 0.5108221688262243, 'repetition_penalty': 1.202653583269648, 'no_repeat_ngram_size': 5, 'max_new_tokens': 128}. Best is trial 12 with value: 0.358695652173913.


Trial 15: score=0.2472, length=82, params: top_k=86, top_p=0.88, temp=0.51, rep_pen=1.20, no_rep_ngram=5, max_tokens=128


[I 2025-02-28 07:13:51,777] Trial 16 finished with value: -0.19092511013215857 and parameters: {'top_k': 88, 'top_p': 0.8624769053976025, 'temperature': 1.0599374743588286, 'repetition_penalty': 1.4482570880897228, 'no_repeat_ngram_size': 2, 'max_new_tokens': 216}. Best is trial 12 with value: 0.358695652173913.


Trial 16: score=-0.1909, length=131, params: top_k=88, top_p=0.86, temp=1.06, rep_pen=1.45, no_rep_ngram=2, max_tokens=216


[I 2025-02-28 07:13:57,927] Trial 17 finished with value: -0.19636363636363635 and parameters: {'top_k': 57, 'top_p': 0.9096855700056012, 'temperature': 0.5064263967633732, 'repetition_penalty': 1.2298622136919688, 'no_repeat_ngram_size': 3, 'max_new_tokens': 172}. Best is trial 12 with value: 0.358695652173913.


Trial 17: score=-0.1964, length=132, params: top_k=57, top_p=0.91, temp=0.51, rep_pen=1.23, no_rep_ngram=3, max_tokens=172


[I 2025-02-28 07:14:03,252] Trial 18 finished with value: 0.26851063829787236 and parameters: {'top_k': 93, 'top_p': 0.835670247884545, 'temperature': 0.7291077329798742, 'repetition_penalty': 1.0078964118080158, 'no_repeat_ngram_size': 2, 'max_new_tokens': 146}. Best is trial 12 with value: 0.358695652173913.


Trial 18: score=0.2685, length=99, params: top_k=93, top_p=0.84, temp=0.73, rep_pen=1.01, no_rep_ngram=2, max_tokens=146


[I 2025-02-28 07:14:08,677] Trial 19 finished with value: 0.15819209039548024 and parameters: {'top_k': 83, 'top_p': 0.6806388066006412, 'temperature': 0.6157865495104835, 'repetition_penalty': 1.3391549552949282, 'no_repeat_ngram_size': 3, 'max_new_tokens': 147}. Best is trial 12 with value: 0.358695652173913.


Trial 19: score=0.1582, length=88, params: top_k=83, top_p=0.68, temp=0.62, rep_pen=1.34, no_rep_ngram=3, max_tokens=147
Best hyperparameters found: {'top_k': 98, 'top_p': 0.9425853226859308, 'temperature': 0.5176776992651415, 'repetition_penalty': 1.0180368146560737, 'no_repeat_ngram_size': 2, 'max_new_tokens': 129}


In [7]:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import fitz  # PyMuPDF
import re
import torch

# Function to sanitize text by removing illegal characters
def sanitize_text(text):
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Function to remove duplicate sentences
def remove_duplicate_sentences(text):
    sentences = text.split('. ')
    unique_sentences = list(dict.fromkeys(sentences))  # Preserve order and remove duplicates
    return '. '.join(unique_sentences)

# Function to load the model and tokenizer
@st.cache_resource
def load_model_and_tokenizer(model_name):
    try:
        with st.spinner("Loading model and tokenizer..."):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForCausalLM.from_pretrained(
                model_name, 
                torch_dtype=torch.float16,  # Load in fp16 for efficiency
                device_map="auto"  # Automatically use GPU if available
            )
            if torch.cuda.is_available():
                model.to("cuda")
            st.success("✅ Model loaded successfully.")
            return model, tokenizer
    except Exception as e:
        st.error(f"❌ Error loading model/tokenizer: {e}")
        return None, None

# Function to extract text from a PDF file
def extract_text_from_pdf(uploaded_file):
    text = ""
    try:
        pdf_document = fitz.open(stream=uploaded_file.read(), filetype="pdf")
        for page in pdf_document:
            text += page.get_text() or ""
        pdf_document.close()
    except Exception as e:
        st.error(f"❌ Error extracting text from PDF: {e}")
    return text.strip()

# Function to chunk the input text
def chunk_text(text, max_length=2000):
    # Split the text into chunks of max_length
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) >= max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Updated function to generate a summary using the tuned hyperparameters
def generate_summary(input_text, model, tokenizer):
    try:
        prompt = (
            "Below is a legal document. Summarize its key points concisely.\n\n"
            "### Document:\n{input_text}\n\n### Summary:"
        )
        # Use only the first 4096 characters from the chunk
        input_str = prompt.format(input_text=input_text[:4096])
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model_inputs = tokenizer(
            input_str,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=4096
        ).to(device)

        # Use tuned generation parameters
        summary_output = model.generate(
            model_inputs.input_ids,
            max_new_tokens=129,
            do_sample=True,
            top_k=98,
            top_p=0.9426,
            temperature=0.5177,
            repetition_penalty=1.0180,
            no_repeat_ngram_size=2
        )

        full_output = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        marker = "### Summary:"
        summary = full_output.split(marker, 1)[1].strip() if marker in full_output else full_output.strip()
        return sanitize_text(summary)
    except Exception as e:
        st.error(f"❌ Error generating summary: {e}")
        return ""

# Streamlit interface
st.title("Legal Case Summary Generator")

# Update the model name to use the tuned repository
model_name = "coderop12/Empowering_Legal_Summarization"
model, tokenizer = load_model_and_tokenizer(model_name)

# File upload
uploaded_file = st.file_uploader("Upload a case PDF file", type=["pdf"])

if uploaded_file is not None:
    case_text = extract_text_from_pdf(uploaded_file)
    
    st.subheader("Original Case Text")
    st.write(case_text)

    if st.button("Generate Summary"):
        if model is not None and tokenizer is not None:
            sanitized_text = sanitize_text(case_text)
            unique_text = remove_duplicate_sentences(sanitized_text)
            chunks = chunk_text(unique_text)

            # Generate summaries for each chunk using the tuned settings
            summaries = []
            for chunk in chunks:
                summary = generate_summary(chunk, model, tokenizer)
                summaries.append(summary)

            # Combine the summaries
            final_summary = "\n".join(summaries)
            sanitized_final_summary = sanitize_text(final_summary)
            st.subheader("Generated Summary")
            st.write(sanitized_final_summary)
        else:
            st.error("Model or tokenizer not loaded properly.")


2025-02-28 07:20:41.070 
  command:

    streamlit run /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [8]:
!streamlit run app.py


Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py


In [5]:
!pip install streamlit
!pip install tools
!pip install fitz
!pip install frontend

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting httplib2 (from fitz)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nibabel (from fitz)
  Downloading nibabel-5.3.2-py3-none-any.whl.metadata (9.1 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.9.2-py3-none-any.whl.metadata (6.8 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting importlib-resources>=5.12 (from nibabel->fitz)
  Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting pydot>=1.2.3 (from nipype->fitz)
  Downloading pydot-3.0.4-py3-none-any.whl.metadata