# Loading Dataset from Kaggle

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'legal-text-classification-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3873453%2F6723483%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240521%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240521T171823Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D50bf3ff26863dfa1126bfb7865725ad59402f07be0f3ef136b683594b08781b8920f4f34aba4235adfa0c1297fac2c05b6e6c196ec968795f461a56d59e942d1a1b4f105fc45cfd64ff054ff004c0b1e2cca2f39956ec7eeb88bbb8a4c879e3d84131a15f98917cc54bc16299d42f32a298c18c65aac66c3d8eef368d9ca08c65aa62c6d441d5593df7041f4a0b2e80b4fabcfd2b432e0dca2690c811d6d7416e2a41a9d2a92c669036fc1e567e37ea70890def6e792bb686ca3cb43a6f9f98a2394a820446902a737e765d827e144072a69039f8190e52b61e97398d315dcb7e5a17636a0574bb4e7e3803ac2ceaabf4c5c75d5b1d086b6c5496a48475873c4'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory: remove any previous /kaggle/input tree
# (errors such as "does not exist" are ignored), then (re)create the input and
# working directories with mode 0o777.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working as symlinks to the directories above so that
# relative paths resolve to the same locations. An already existing link is
# left in place (FileExistsError is deliberately ignored).
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. Entries are comma-separated and each one has
# the form "<directory>:<percent-encoded download URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unexpected extra colon in the URL part
    # cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; in that case the
            # progress bar stays empty instead of int(None) raising TypeError.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width in case more bytes arrive than announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch below reopens the
            # temporary file by name and must see the complete archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the imported module with a TarFile instance.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs stop working after their expiry time.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading legal-text-classification-dataset, 15646328 bytes compressed
Downloaded and uncompressed: legal-text-classification-dataset
Data source import complete.


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/kaggle/input/legal-text-classification-dataset/legal_text_classification.csv',on_bad_lines="skip",engine='python')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24984 entries, 0 to 24983
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24984 non-null  object
 1   case_outcome  24984 non-null  object
 2   case_title    24984 non-null  object
 3   case_text     24808 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB


In [54]:
title, case_text = df.iloc[3][['case_title', 'case_text']]

In [55]:
print("Title:")
print(title)
print("Case Text:")
print(case_text)

Title:
Dais Studio Pty Ltd v Bullett Creative Pty Ltd [2008] FCA 42
Case Text:
The general principles governing the exercise of the discretion to award indemnity costs after rejection by an unsuccessful party of a so called Calderbank letter were set out in the judgment of the Full Court in Black v Lipovac [1998] FCA 699 ; (1998) 217 ALR 386. In summary those principles are: 1. Mere refusal of a "Calderbank offer" does not itself warrant an order for indemnity costs. In this connection it may be noted that Jessup J in Dais Studio Pty Ltd v Bullet Creative Pty Ltd [2008] FCA 42 said that (at [6]): if the rejection of such an offer is to ground a claim for indemnity costs, it must be by reason of some circumstance other than that the offer happened to comply with the Calderbank principle. 2. To obtain an order for indemnity costs the offeror must show that the refusal to accept it was unreasonable. 3. The reasonableness of the conduct of the offeree is to be viewed in the light of the ci

# Preprocessing

In [7]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def _preprocessing_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    The pair is cached on the function object so the stop-word list is loaded
    and the lemmatizer constructed once, not once per processed document.
    """
    cached = getattr(_preprocessing_resources, '_cache', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _preprocessing_resources._cache = cached
    return cached


def preprocess_text(text: str) -> str:
    """Normalize a piece of text for downstream processing.

    Steps, in order: convert the input to ``str``, lowercase it, delete every
    character that is neither a word character nor whitespace, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining token
    with ``WordNetLemmatizer``, and join the tokens with single spaces.

    Args:
        text: The raw text. Non-string values are passed through ``str()``
            first, so a missing value such as NaN becomes the literal token
            ``"nan"`` rather than an empty string.

    Returns:
        The cleaned, space-separated token string.
    """
    stop_words, lemmatizer = _preprocessing_resources()
    # Punctuation is removed before tokenizing, so citations such as
    # "[2005] FCA 79" become the plain tokens "2005 fca 79".
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
pre_text = preprocess_text(case_text)

In [47]:
# Processed
print(pre_text)

assumption without foundation refused interlocutory relief 10 february 2005 see 2005 fca 79 leave appeal judgment refused hely j 3 may 2005 see 2005 fca 541


In [48]:
# Original
print(case_text)

The assumption is without foundation. I refused interlocutory relief on 10 February 2005; see [2005] FCA 79. Leave to appeal from my judgment was refused by Hely J on 3 May 2005; see [2005] FCA 541.


# Checking entities

In [44]:
# Run the small English spaCy pipeline over the preprocessed text and list
# every named entity it finds as (text, label) pairs.
nlp = spacy.load('en_core_web_sm')
doc = nlp(pre_text)
entities = [(span.text, span.label_) for span in doc.ents]
for content, label in entities:
  print('Entity:', label + ',', 'Content:', content)

Entity: DATE, Content: 10 february 2005
Entity: DATE, Content: 2005
Entity: CARDINAL, Content: 79
Entity: DATE, Content: 3 may 2005
Entity: DATE, Content: 2005
Entity: CARDINAL, Content: 541


# Using the Llama 3 70B model (llama3-70b-8192) for summarization

In [11]:
!pip install groq



In [12]:
import os

# Read the Groq API key from the GROQ_API_KEY environment variable so a real
# key never has to be saved inside the notebook. The original placeholder is
# kept as the fallback: set the variable (or replace the placeholder) before
# calling the API.
groq_api = os.environ.get("GROQ_API_KEY", "use_your_api_key_from_groq_cloud")

In [13]:
from groq import Groq

client = Groq(
    api_key=groq_api
)
def get_summary(text):
  """Ask the Groq-hosted llama3-70b-8192 model for a summary of `text`.

  The system prompt instructs the model to answer in the form
  "Summary: (summary content)". The response is streamed, the chunks are
  concatenated, and a leading "Summary:" label is removed when the model
  actually produced one.

  Args:
      text: The legal text to summarize; it is interpolated into the user
          prompt as-is.

  Returns:
      The summary text without the "Summary:" label and without surrounding
      whitespace.

  Raises:
      Whatever `client.chat.completions.create` or iterating the stream
      raises; no errors are handled here.
  """
  completion = client.chat.completions.create(
      model="llama3-70b-8192",
      messages=[
          {"role": "system", "content": "You are a legal assistant that summarizes legal documents. Use this format to write your response: Summary: (summary content)"},
          {"role": "user", "content": f"Summarize the following legal text. Note that the summary should be concise but includes all the necessary infromation like import entities e.t.c:\n\n{text}\n\nSummary:"},
      ],
      # NOTE(review): temperature=1 presumably means sampled output, so the
      # summary can differ between runs; confirm whether that is intended.
      temperature=1,
      max_tokens=1024,
      top_p=1,
      stream=True,
      stop=None,
  )
  # A streamed chunk may carry no content (None); treat that as an empty piece.
  result = "".join(chunk.choices[0].delta.content or "" for chunk in completion)
  result = result.strip()
  # Remove the "Summary:" label only if it is really there. Blindly slicing a
  # fixed number of characters would cut real content whenever the model
  # answers without the label.
  label = 'Summary:'
  if result[:len(label)].lower() == label.lower():
    result = result[len(label):].lstrip()
  return result
print(get_summary(pre_text))


The Federal Court of Australia (FCA) denied interlocutory relief on February 10, 2005 (2005 FCA 79). Subsequently, leave to appeal the judgment was refused by Hely J on May 3, 2005 (2005 FCA 541).


In [14]:
df.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


# Summarizing the first 100 entries in the dataset
Only the first 100 entries are summarized because summarization is slow and the dataset has nearly 25k entries, which is too many to process here.

In [15]:
df = df.dropna().iloc[:100]

In [16]:
# Drop the columns that are not needed for summarization.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['case_id', 'case_outcome'], errors='ignore')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   case_title  100 non-null    object
 1   case_text   100 non-null    object
dtypes: object(2)
memory usage: 2.3+ KB


In [18]:
df['Preprocessed'] = df['case_text'].apply(preprocess_text)
df.head()

Unnamed: 0,case_title,case_text,Preprocessed
0,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...
1,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,general principle governing exercise discretio...
2,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...
3,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,general principle governing exercise discretio...
4,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,preceding general principle inform exercise di...


# Get summaries in the Summary column

In [19]:
df['Summary'] = df['Preprocessed'].apply(get_summary)

# Printing the first 5 summaries

In [25]:
# Print the first five generated summaries, each followed by a blank line.
for position in range(5):
  summary = df['Summary'].iloc[position]
  print(summary, end='\n\n')

In awarding indemnity costs, the court exercises discretion, departing from the normal practice, and requires a special or unusual feature in the case. This principle is supported by Alpine Hardwood Aust Pty Ltd v Hardy Pty Ltd (2002) FCA 224, (2002) 190 ALR 121, where Weinberg J cited Colgate Palmolive Co v Cussons Pty Ltd (1993) 46 FCR 225, 233, per Sheppard J.

The general principle governing the exercise of discretion to award indemnity costs in a rejected Calderbank letter is outlined. According to the Full Court in Black v Lipovac (1998) FCA 699, (1998) 217 ALR 386, a mere refusal of a Calderbank offer does not warrant an order for indemnity costs. The High Court in Jessup v Dais Studio Pty Ltd v Bullet Creative Pty Ltd (2008) FCA 42 emphasized that rejection of an offer is not, in itself, a ground for indemnity costs. To obtain an order for indemnity costs, the offeror must show that the refusal to accept the offer was unreasonable in the circumstances. The reasonableness of the

# Save the summaries in a CSV file

In [53]:
df['Summary'].to_csv('/kaggle/working/summary.csv',index=False)