<a href="https://colab.research.google.com/github/VickkiMars/NLP_Mastery/blob/main/text_summarization_on_legal_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets &> /dev/null
!pip install transformers &> /dev/null

In [7]:
import os, sys, logging, nltk, numpy as np, tensorflow as tf, pandas as pd

In [3]:
from tensorflow import keras
tf.get_logger().setLevel(logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [155]:
from datasets import load_dataset

ds = load_dataset("AjayMukundS/Legal_Text_Summarization-llama2", split='train')

In [156]:
print(ds)

Dataset({
    features: ['judgement', 'dataset_name', 'summary', 'text'],
    num_rows: 7773
})


In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-large")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



In [157]:
ds = ds.train_test_split(train_size=0.3, test_size=0.3)

In [158]:
ds

DatasetDict({
    train: Dataset({
        features: ['judgement', 'dataset_name', 'summary', 'text'],
        num_rows: 2331
    })
    test: Dataset({
        features: ['judgement', 'dataset_name', 'summary', 'text'],
        num_rows: 2332
    })
})

In [187]:
ds = ds.remove_columns(['dataset_name', 'judgement'])

In [188]:
columns = ['text', 'summary']

In [189]:
def find_empty(column):
  index = []
  for data in ds['train'][column]:
    if data == '' or data == " ":
      index.append(ds['train'][column].index(data))

  return index

In [190]:
for col in columns:
  print(find_empty(col))

[]
[]


In [191]:
# Assuming 'df' contains your Hugging Face Dataset and 'train' is the split you want to modify:
df_train = ds['train']

# Get all indices *except* the one you want to remove:
indices_to_keep = [i for i in range(len(df_train)) if i != 241 and i != 1085]

# Use 'select' to create a new Dataset without the row at index 2177:
ds['train'] = df_train.select(indices_to_keep)

In [192]:
def proc(x):
  for char in ["<s>","[INST]","[/INST]","</s>","\n" ]:
    x = x.replace(char, "")
  return x


In [193]:
def find_length(column):
  max, min = 0, 9e9
  ind1, ind2  = 0, 0
  for data in ds['train'][column]:
    if len(proc(data).split(' ')) > max:
      ind1 = ds['train'][column].index(data)
      max = len(proc(data).split(' '))

  for data in ds['train'][column]:
    if len(proc(data).split(' ')) < min:
      ind2 = ds['train'][column].index(data)
      min = len(proc(data).split(' '))

  return f"Minimum Word Length for '{column}': {min}\nMaximum Word Length for '{column}': {max}\nIndex of Minimum Word Length for '{column}': {ind1}\nIndex of Maximum Word Length for '{column}': {ind2}\n"

In [194]:
for col in columns:
  print(find_length(col))

Minimum Word Length for 'text': 245
Maximum Word Length for 'text': 84690
Index of Minimum Word Length for 'text': 1110
Index of Maximum Word Length for 'text': 347

Minimum Word Length for 'summary': 41
Maximum Word Length for 'summary': 14251
Index of Minimum Word Length for 'summary': 854
Index of Maximum Word Length for 'summary': 264



In [None]:
MAX_INPUT_LENGTH = 84690
MIN_TARGET_LENGTH = 41
MAX_TARGET_LENGTH = 14251
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
EPOCHS = 5

MODEL_NAME = "t5-large"

In [196]:
def preprocess_text(input_text):
  inputs = [proc(text) for text in input_text['text']]
  model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

  with tokenizer.as_target_tokenizer():
    labels = tokenizer(input_text['summary'], max_length=MAX_TARGET_LENGTH, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [197]:
tokenized_df = ds.map(preprocess_text, batched=True)

Map:   0%|          | 0/2326 [00:00<?, ? examples/s]



Map:   0%|          | 0/2332 [00:00<?, ? examples/s]

In [185]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [186]:

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [200]:
train_dataset = tokenized_df["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_df["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)
generation_dataset = (
    tokenized_df["test"]
    .shuffle()
    .select(list(range(200)))
    .to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [201]:
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

In [203]:
!pip install keras-hub &> /dev/null

In [207]:
!pip install rouge_score &> /dev/null

In [213]:
!pip install evaluate --upgrade  # Upgrade 'datasets' to the latest version

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [215]:
import keras_hub
import evaluate
rouge_l = evaluate.load('rouge')


def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    # We will print only the F1 score, you can use other aggregation metrics as well
    result = {"RougeL": result["f1_score"]}

    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data
model.fit(
    train_dataset, validation_data=test_dataset, epochs=EPOCHS, callbacks=callbacks
)



Epoch 1/5


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(
    raw_datasets["test"][0]["judgement"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)