# Fine Tuning Pegasus - Legal Summarization

In [1]:
# Install the necessary libraries here

!pip install transformers -q
!pip install wandb

!pip install rouge-score
!pip install shap
!pip install sentencepiece

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

[K     |████████████████████████████████| 2.3MB 7.3MB/s 
[K     |████████████████████████████████| 901kB 42.2MB/s 
[K     |████████████████████████████████| 3.3MB 51.2MB/s 
[?25hCollecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/6c/48/b199e2b3b341ac842108c5db4956091dd75d961cfa77aceb033e99cac20f/wandb-0.10.31-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.2MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/1c/4a/a54b254f67d8f4052338d54ebe90126f200693440a93ef76d254d581e3ec/sentry_sdk-1.1.0-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 46.5MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:

# os.chdir("D:/Datasets/453_NLP_Final_Project")
#os.environ['TRANSFORMERS_CACHE'] = 'D:/huggingface/transformers'
#os.environ['HF_DATASETS_CACHE'] = 'D:/huggingface/datasets'
#os.environ['HF_METRICS_CACHE'] = 'D:/huggingface/metrics'
#os.environ['HF_MODULE_CACHE'] = 'D:/huggingface/modules'

import numpy as np
import pandas as pd
import torch

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# WandB – Import the wandb library
import wandb
import time
from rouge_score import rouge_scorer
import shap
import sentencepiece

# Project Parameters
filepath = '/content/gdrive/My Drive/NLP_FP/Training_Data_Clean.csv'

save_directory = '/content/gdrive/My Drive/NLP_FP/Pegasus_Legal_2'

predictions_filepath = '/content/gdrive/My Drive/NLP_FP/Pegasus_Legal_2/predictions.csv'

wandb_project_name = "Pegasus_Legal_Summarization_Run_2"

!wandb login ed78357f90c301b50743ea99cb9000752f69a842

!nvidia-smi

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Tue Jun  1 03:14:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, 
                                                  pad_to_max_length=True,
                                                  return_tensors='pt',
                                                  truncation=True)
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, 
                                                  pad_to_max_length=True,
                                                  return_tensors='pt',
                                                  truncation=True)

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        
        # Changed this code from lm_labels to labels; lm_labels is deprecated - https://github.com/priya-dwivedi/Deep-Learning/issues/137
        # outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()
        return model

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=250, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
def main():
    # WandB – Initialize a new run
    wandb.init(project=wandb_project_name)

    # WandB – Config is a variable that holds and saves hyperparameters and inputs
    # Defining some key variables that will be used later on in the training  
    config = wandb.config          # Initialize config
    config.TRAIN_BATCH_SIZE = 1    # input batch size for training (default: 64)
    config.VALID_BATCH_SIZE = 1    # input batch size for testing (default: 1000)
    config.TRAIN_EPOCHS = 1        # number of epochs to train (default: 10)
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
    config.SEED = 42               # random seed (default: 42)
    config.MAX_LEN = 512
    config.SUMMARY_LEN = 250 

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config.SEED) # pytorch random seed
    np.random.seed(config.SEED) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")

    # Importing and Pre-Processing the domain data
    # Selecting the needed columns only. 
    # Adding the summarzie text in front of the text. 
    # This is to format the dataset similar to how T5 model was trained for summarization task. 
    df = pd.read_csv(filepath, encoding='latin-1')
    df = df[['text','ctext']]
    df.ctext = 'summarize: ' + df.ctext
    print(df.head())

    
    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.8
    split = int(train_size * df.shape[0])
    #train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    train_dataset = df.iloc[:split]
    val_dataset = df.iloc[split:]
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
    model = model.to(device)

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    # Log metrics with wandb
    wandb.watch(model, log="all")
    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
    #    train(epoch, tokenizer, model, device, training_loader, optimizer)
        fine_tuned_model = train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(predictions_filepath)
        print('Output Files generated for review')
    
    wandb.finish()
    return fine_tuned_model

if __name__ == '__main__':
    fine_tuned_model = main()
    fine_tuned_model.save_pretrained(save_directory=save_directory)

[34m[1mwandb[0m: Currently logged in as: [33mogk01[0m (use `wandb login --relogin` to force relogin)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1328.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1341.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1508.0, style=ProgressStyle(description…


                                                text                                              ctext
0  3 Chairmans statement Oxford Universitys Said ...  summarize:  4MEGGITT PLC REPORT AND ACCOUNTS 2...
1  Chairmans statement Growing the Group Its been...  summarize:  6 MEGGITT PLC          REPORT AND ...
2  Chairmans statement I am pleased to report our...  summarize:  Chief Executives reviewDivestments...
3  GROUP OVERVIEW Chairmans statement 2014 has be...  summarize:  STRATEGIC REPORT Chief Executive O...
4  2 Metalrax Group PLC Engineering specialists I...  summarize:  CHIEF EXECUTIVES REVIEW6 Metalrax ...
FULL Dataset: (1528, 2)
TRAIN Dataset: (1222, 2)
TEST Dataset: (306, 2)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2283841687.0, style=ProgressStyle(descr…


Initiating Fine-Tuning for the model on our dataset


The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).


Epoch: 0, Loss:  9.35388469696045
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Output Files generated for review


VBox(children=(Label(value=' 0.07MB of 0.07MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training Loss,9.35388
_runtime,1814.0
_timestamp,1622000287.0
_step,1.0


0,1
Training Loss,▁
_runtime,▁█
_timestamp,▁█
_step,▁█


In [None]:
# End of Model Fine Tuning

### Review Output

In [4]:
training_data = pd.read_csv(filepath)
predictions_sample = pd.read_csv(predictions_filepath)
predictions_sample.head(10)
predictions_sample.shape

Unnamed: 0.1,Unnamed: 0,Generated Text,Actual Text
0,0,FirstGroup's 2013 Annual Report and Accounts f...,02 FirstGroup Annual Report and Accounts 2013 ...
1,1,The annual report and accounts for the year en...,2 FirstGroup Annual Report and Accounts 2014 C...
2,2,A further weakening of the US dollar reduced t...,04 FKI plc Chairmans statement Gordon Page Cha...
3,3,In 2007 Flomerics invested in expanding its sa...,CHAIRMANS STATEMENT During 2007 there was good...
4,4,The Board of Directors of Energetix Group plc ...,2 Energetix Group plc Group Financial Statemen...
5,5,The Board is pleased with the successful trans...,2 E n e r ge t i x Group plc Group Financial S...
6,6,The Group has achieved both revenue growth and...,5 Business review In my first annual report as...
7,7,The 2004 Financial Sector Technology Awards na...,4 Focus Solutions Group plc Annual Report & Ac...
8,8,The audited financial results for the year end...,4 Focus Solutions Group plc Annual Report & Ac...
9,9,The group again achieved record sales andprofi...,Introduction I am pleased to be able to report...


(306, 3)

In [None]:
def shap_summary_values(model, tokenizer, text):
    text_list = []
    text_list.append(text)
    explainer = shap.Explainer(model, tokenizer)
    shap_values = explainer(text_list)
    display(shap.plots.text(shap_values))
    return

def view_summary_comparisons(i, predictions_sample=predictions_sample):
    summary1 = predictions_sample["Generated Text"].iloc[i]
    print(summary1, '\n')
    print(len(predictions_sample["Generated Text"].iloc[i].split()), '\n')
    summary2 = predictions_sample["Actual Text"].iloc[i]
    print(summary2, '\n')
    print(len(predictions_sample["Actual Text"].iloc[i].split()))
    return

def get_training_data(i, tokens, training_data=training_data):
    print(training_data.loc[i])
    print(training_data.loc[i, "ctext"])
    print("\n")
    len(training_data.loc[i, "ctext"].split())
    print("\n")
    text = ' '.join(training_data.loc[i, "ctext"].split()[0:tokens])
    print("\n")
    len(text.split())
    return text

In [None]:
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
print(fine_tuned_model)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, element

#### Shap Values

In [None]:
view_summary_comparisons(0)
text1 = get_training_data(1222, tokens=300, training_data=training_data)

FirstGroup's 2013 Annual Report and Accounts for the year ending 31 December 2013 published today. The full text of the report is available on the company's website at www.firstgroup.com. The annual report also includes a summary of the key strategic developments that have taken place since the last report. The CEO highlights the significant progress made in the Group's transformation plan, which has led to the creation of a world-class transport operator with a diverse portfolio of assets across North America and Europe. In May 2013, the Board announced plans to raise approximately 615 million pounds through a rights issue. This will remove the constraints of the Company's current balance sheet and provide the flexibility to continue its transformation plans and invest to create sustainable, long term value. We aim to increase Group revenue (excluding UK Rail) at a faster rate than the economies we serve, improve margins in First Student and UK Bus1 to double digit levels, and achieve

In [None]:
shap_summary_values(fine_tuned_model, tokenizer, text1)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
text2 = get_training_data(1222, tokens=400, training_data=training_data)
shap_summary_values(fine_tuned_model, tokenizer, text2)

Output hidden; open in https://colab.research.google.com to view.

#### ROUGE Score Calculation

In [5]:
# Rouge Score Calculation

def rouge_scores(gen_summary_list, actual_summary_list, metric='recall'):
    rouge1_scores = []
    rougeL_scores = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    for i in range(0, len(gen_summary_list)):
        scores = scorer.score(actual_summary_list[i], gen_summary_list[i])
        if metric == 'recall':
            rouge1_scores.append(scores['rouge1'][1])
            rougeL_scores.append(scores['rougeL'][1])
        elif metric == 'precision':
            rouge1_scores.append(scores['rouge1'][0])
            rougeL_scores.append(scores['rougeL'][0])
        elif metric == 'fmeasure':
            rouge1_scores.append(scores['rouge1'][2])
            rougeL_scores.append(scores['rougeL'][2])     
    print("Average Rouge-1", str(metric), ":", round(np.mean(rouge1_scores), 2))
    print("Average Rouge-L", str(metric), ":", round(np.mean(rougeL_scores), 2))
    return

In [6]:
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='recall')
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='precision')
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='fmeasure')

Average Rouge-1 recall : 0.31
Average Rouge-L recall : 0.16
Average Rouge-1 precision : 0.37
Average Rouge-L precision : 0.19
Average Rouge-1 fmeasure : 0.33
Average Rouge-L fmeasure : 0.17
