# Import Libraries
Import the libraries used for text processing and sentence scoring (NumPy, pandas, NLTK, and scikit-learn) and download the NLTK resources that the later cells depend on.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources this notebook relies on. As the cell output
# shows, packages that are already installed are simply reported as up-to-date.
nltk.download('punkt')  # tokenizer models; presumably what sent_tokenize / word_tokenize load
nltk.download('stopwords')  # corpus read later through stopwords.words('english')
nltk.download('punkt_tab')  # NOTE(review): presumably the punkt data layout newer NLTK releases expect; confirm
nltk.download('wordnet')  # lexical database; presumably what WordNetLemmatizer reads

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Text Preprocessing Functions
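Create functions to clean and preprocess text: lowercasing and punctuation removal, tokenization, stopword removal, and lemmatization. Sentence segmentation (`sent_tokenize`) is imported here and applied later, in the summarization step.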

In [2]:
# Text Preprocessing Functions

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
from rouge import Rouge
import os

# Function to clean and preprocess text
def clean_text(text):
    """Normalise a raw string before tokenisation.

    The input is lowercased, every non-word character is replaced by a
    space, runs of whitespace are collapsed to a single space, and the
    result is trimmed at both ends. Digits and underscores are kept.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # \W matches everything outside the word-character class, so punctuation
    # is dropped while letters, digits and "_" survive.
    without_punctuation = re.sub(r'\W', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Stopword set and lemmatizer, built on first use and reused afterwards.
# preprocess_text is called once per sentence, so rebuilding them on every
# call would re-read the stopword corpus each time.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn raw text into a list of lemmatised, stopword-free tokens.

    The text is normalised with ``clean_text``, split with
    ``word_tokenize``, filtered against the English stopword list, and
    each remaining token is lemmatised.

    Args:
        text: Text to preprocess.

    Returns:
        A list of token strings. Empty when ``text`` is not a string, is
        empty, or contains nothing but stopwords/punctuation.
    """
    if not isinstance(text, str) or not text:
        return []
    stop_words, lemmatizer = _get_preprocess_resources()
    words = word_tokenize(clean_text(text))
    # Return the tokens themselves; callers re-join them when they need a string.
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


# Extractive Summary Generation
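Define an extractive summarizer that ranks sentences by the cosine similarity of their TF-IDF vectors, plus a ROUGE-based helper for evaluating a generated summary against a reference.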

In [3]:
# Summarization function using extractive summarization with TF-IDF
def summarize_text(text, max_sentences=5):
    """Return an extractive summary built from the most central sentences.

    The text is split with ``sent_tokenize``; each sentence is normalised
    with ``preprocess_text`` and vectorised with TF-IDF. A sentence's score
    is the sum of its cosine similarities to all scored sentences, and the
    ``max_sentences`` highest-scoring sentences are returned in their
    original order, joined by single spaces.

    Args:
        text: Document to summarise. Non-string or empty input yields ``""``.
        max_sentences: Maximum number of sentences to keep. Values below 1
            yield ``""``.

    Returns:
        The summary string. ``text`` is returned unchanged when it has no
        more than ``max_sentences`` sentences (before or after dropping
        sentences that preprocess to nothing), or when TF-IDF cannot build
        a vocabulary.
    """
    if not isinstance(text, str) or not text:
        return ""
    # A non-positive budget means "keep nothing". Without this guard the
    # slice [-max_sentences:] further down becomes [-0:] and would select
    # every sentence instead of none.
    if max_sentences < 1:
        return ""

    original_sentences = sent_tokenize(text)
    if len(original_sentences) <= max_sentences:
        return text

    # Keep only sentences that still have tokens after preprocessing, and
    # remember where each one sits in the original sentence list.
    preprocessed_sentences = []
    valid_indices = []
    for idx, sentence in enumerate(original_sentences):
        tokens = preprocess_text(sentence)
        if tokens:
            preprocessed_sentences.append(' '.join(tokens))
            valid_indices.append(idx)

    if len(preprocessed_sentences) <= max_sentences:
        return text

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed_sentences)
    except ValueError:
        # Raised when no usable vocabulary can be built from the sentences.
        return text

    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # argsort is ascending, so the tail holds the highest-scoring sentences.
    top_positions = np.argsort(sentence_scores)[-max_sentences:]
    # Map back to original positions and restore document order.
    selected_original_indices = sorted(valid_indices[pos] for pos in top_positions)

    return ' '.join(original_sentences[i] for i in selected_original_indices)
    
# Evaluation function
def _zero_rouge_scores():
    """Return a fresh all-zero result with the same keys as a ROUGE score dict."""
    return {
        metric: {"r": 0.0, "p": 0.0, "f": 0.0}
        for metric in ("rouge-1", "rouge-2", "rouge-l")
    }


def evaluate_summary(reference_summary, generated_summary):
    """Score a generated summary against a reference using ROUGE.

    Args:
        reference_summary: Reference text.
        generated_summary: Candidate text to evaluate.

    Returns:
        A dict keyed by ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"``.
        When either input is empty, or scoring fails, every metric maps to
        ``{"r": 0.0, "p": 0.0, "f": 0.0}`` so callers can read recall,
        precision and F-score without special-casing the fallback.
    """
    # Check the inputs before constructing the scorer: nothing to score
    # means nothing to build.
    if not reference_summary or not generated_summary:
        return _zero_rouge_scores()

    try:
        return Rouge().get_scores(generated_summary, reference_summary)[0]
    except Exception:
        # Deliberate best effort: one unscorable pair must not abort a
        # batch evaluation, so it is reported as all zeros.
        return _zero_rouge_scores()


# Load the dataset

In [4]:
# Load the legal-case dataset and validate its schema. Failures are re-raised
# so that "Run All" stops at this cell with a traceback, instead of ending the
# session via exit() or carrying on with `legal_data` undefined.
text_column = 'case_text'  # column holding the text to summarise

try:
    legal_data = pd.read_csv('legal_text_classification.csv')
except Exception as e:
    print(f"Error: {e}")
    raise

print(f"Dataset loaded successfully with {legal_data.shape[0]} rows")

# Single schema check: every later cell reads legal_data[text_column].
if text_column not in legal_data.columns:
    raise ValueError("Column 'case_text' not found in dataset")

Dataset loaded successfully with 24985 rows


# Apply summarization to the legal texts

In [5]:
print("Summarizing legal texts...")
# Each value is passed through str() first, so non-string cells are
# summarised from their string representation; summaries are capped at 3 sentences.
legal_data['summary'] = legal_data[text_column].apply(
    lambda case_text: summarize_text(str(case_text), max_sentences=3)
)

Summarizing legal texts...


In [6]:
# Calculate summary statistics

In [7]:
# Character counts of the source text and of its summary, and their ratio
# (summary length divided by original length).
legal_data['original_length'] = legal_data[text_column].map(lambda raw: len(str(raw)))
legal_data['summary_length'] = legal_data['summary'].map(len)
legal_data['compression_ratio'] = legal_data['summary_length'].div(legal_data['original_length'])


In [8]:
# Report completion and the mean summary/original length ratio.
print("Summarization complete!")
print("Average compression ratio: {:.2f}".format(legal_data['compression_ratio'].mean()))

Summarization complete!
Average compression ratio: 0.56


In [9]:
# Export the results to CSV

In [10]:
# Write the frame, including the derived summary and length columns, to CSV
# without the index column.
output_file = 'legal_text_summaries.csv'
legal_data.to_csv(path_or_buf=output_file, index=False)
print("Results saved to {}".format(output_file))

Results saved to legal_text_summaries.csv


In [11]:
# Display sample summaries

In [12]:
print("\n--- Sample Summaries ---")
# Seeded generator so the same rows are shown on every Restart & Run All.
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(legal_data.index, min(3, len(legal_data)), replace=False)
for idx in sample_indices:
    # str() matches how the summarization and length cells read this column,
    # and keeps a missing (NaN) text from raising TypeError on the slice.
    excerpt = str(legal_data.loc[idx, text_column])[:200]
    print(f"\nOriginal text (excerpt): {excerpt}...")
    print(f"Summary: {legal_data.loc[idx, 'summary']}")
    print(f"Compression ratio: {legal_data.loc[idx, 'compression_ratio']:.2f}")


--- Sample Summaries ---

Original text (excerpt): Conveniently, general principles relevant to the Court granting leave to amend pleadings were summarised recently in the judgment of Edmonds J in SPI Spirits (Cyprus) Ltd v Diageo Australia Ltd (No 4)...
Summary: Conveniently, general principles relevant to the Court granting leave to amend pleadings were summarised recently in the judgment of Edmonds J in SPI Spirits (Cyprus) Ltd v Diageo Australia Ltd (No 4) [2007] FCA 1035. In relation to this issue I adopt the following statements of his Honour: " [14] The starting point is that all of such amendments should be made as are necessary to enable the real questions in controversy between the parties to be decided: Queensland v J L Holdings Pty Ltd [1997] HCA 1 ; (1997) 189 CLR 146 ; Dresna Pty Ltd v Misu Nominees Pty Ltd [2003] FCA 1537. [15] The overriding concerns should be to ensure that all matters in issue upon which the parties seek adjudication are determined in the proceeding 