# Import libraries

In [None]:
import pandas as pd
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import re
import ssl

NLTK resources downloaded successfully!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Fix SSL certificate issues if any

In [None]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download NLTK resources
print("Downloading NLTK resources...")
nltk.download('punkt')
nltk.download('stopwords')
print("NLTK resources downloaded!")

# Download required NLTK resources 

In [16]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Set up file paths and load resources

In [18]:
input_file = 'legal_text_classification.csv' 
output_file = 'summarized_cases.csv'

nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# Load the dataset

In [19]:
df = pd.read_csv(input_file)

# Verify the dataset has the required column
if 'case_text' not in df.columns:
    raise ValueError("CSV file must contain a 'case_text' column.")

# Display the first few rows to understand the data
df.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [20]:
def custom_extractive_summary(text, ratio=0.2):
    # Handle empty or invalid text
    if not isinstance(text, str) or not text.strip():
        return ""
    
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    
    # Handle cases with very few sentences
    if len(sentences) <= 3:
        return " ".join(sentences)
    
    # Clean sentences (remove stopwords)
    clean_sentences = [" ".join([word for word in sentence.lower().split() if word not in stop_words]) 
                      for sentence in sentences]
    
    # Create TF-IDF vectorizer and compute sentence vectors
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(clean_sentences)
    
    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(sentence_vectors)
    
    # Calculate sentence scores based on similarity
    sentence_scores = np.sum(similarity_matrix, axis=1)
    
    # Determine number of sentences for the summary
    num_sentences = max(1, int(len(sentences) * ratio))
    
    # Get top sentence indices (sorted by score)
    ranked_sentences = sorted(((sentence_scores[i], i) for i in range(len(sentences))), reverse=True)
    top_sentence_indices = [ranked_sentences[i][1] for i in range(num_sentences)]
    
    # Sort indices to preserve original sentence order
    top_sentence_indices.sort()
    
    # Generate summary
    summary = " ".join([sentences[i] for i in top_sentence_indices])
    
    return summary

# Apply the custom summarization to each case text

In [21]:
df['summary'] = df['case_text'].apply(lambda x: custom_extractive_summary(str(x)))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Shouv/nltk_data'
    - 'C:\\Users\\Shouv\\miniconda3\\envs\\legal_summarizer\\nltk_data'
    - 'C:\\Users\\Shouv\\miniconda3\\envs\\legal_summarizer\\share\\nltk_data'
    - 'C:\\Users\\Shouv\\miniconda3\\envs\\legal_summarizer\\lib\\nltk_data'
    - 'C:\\Users\\Shouv\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


# Preview some summaries

In [None]:
for i in range(min(3, len(df))):
    print(f"Case {i+1}:")
    print(f"Original length: {len(str(df['case_text'].iloc[i]))} characters")
    print(f"Summary length: {len(df['summary'].iloc[i])} characters")
    print(f"Summary: {df['summary'].iloc[i][:200]}...")
    print("-" * 80)

# Save the results

In [None]:
df.to_csv(output_file, index=False)
print(f"Output saved to {output_file}")