# Fine Tuning Pegasus - Financial Summarization

Notes:
- Increased training epochs from 1 to 5

In [1]:
# Install the necessary libraries here

!pip install transformers -q
!pip install wandb

!pip install rouge-score
!pip install shap
!pip install sentencepiece

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev



In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:

# os.chdir("D:/Datasets/453_NLP_Final_Project")
#os.environ['TRANSFORMERS_CACHE'] = 'D:/huggingface/transformers'
#os.environ['HF_DATASETS_CACHE'] = 'D:/huggingface/datasets'
#os.environ['HF_METRICS_CACHE'] = 'D:/huggingface/metrics'
#os.environ['HF_MODULE_CACHE'] = 'D:/huggingface/modules'

import numpy as np
import pandas as pd
import torch

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# WandB – Import the wandb library
import wandb
import time
from rouge_score import rouge_scorer
import shap
import sentencepiece

# Project Parameters
filepath = '/content/gdrive/My Drive/NLP_FP/Training_Data_Clean.csv'

save_directory = '/content/gdrive/My Drive/NLP_FP/Pegasus_Finance_2'

predictions_filepath = '/content/gdrive/My Drive/NLP_FP/Pegasus_Finance_2/predictions.csv'

wandb_project_name = "Pegasus_Summarization_Run_2"

!wandb login ed78357f90c301b50743ea99cb9000752f69a842

!nvidia-smi

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
Tue Jun  1 03:03:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---

In [4]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, 
                                                  pad_to_max_length=True,
                                                  return_tensors='pt',
                                                  truncation=True)
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, 
                                                  pad_to_max_length=True,
                                                  return_tensors='pt',
                                                  truncation=True)

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [5]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        
        # Changed this code from lm_labels to labels; lm_labels is deprecated - https://github.com/priya-dwivedi/Deep-Learning/issues/137
        # outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()
        return model

In [6]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
def main():
    # WandB – Initialize a new run
    wandb.init(project=wandb_project_name)

    # WandB – Config is a variable that holds and saves hyperparameters and inputs
    # Defining some key variables that will be used later on in the training  
    config = wandb.config          # Initialize config
    config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
    config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
    config.TRAIN_EPOCHS = 5        # number of epochs to train (default: 10)
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
    config.SEED = 42               # random seed (default: 42)
    config.MAX_LEN = 512
    config.SUMMARY_LEN = 150 

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config.SEED) # pytorch random seed
    np.random.seed(config.SEED) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    #tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model_name = "human-centered-summarization/financial-summarization-pegasus"
    tokenizer = PegasusTokenizer.from_pretrained(model_name) 

    # Importing and Pre-Processing the domain data
    # Selecting the needed columns only. 
    # Adding the summarzie text in front of the text. 
    # This is to format the dataset similar to how T5 model was trained for summarization task. 
    df = pd.read_csv(filepath, encoding='latin-1')
    df = df[['text','ctext']]
    df.ctext = 'summarize: ' + df.ctext
    print(df.head())

    
    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.8
    split = int(train_size * df.shape[0])
    #train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    train_dataset = df.iloc[:split]
    val_dataset = df.iloc[split:]
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    #model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device)

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    # Log metrics with wandb
    wandb.watch(model, log="all")
    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
    #    train(epoch, tokenizer, model, device, training_loader, optimizer)
        fine_tuned_model = train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(predictions_filepath)
        print('Output Files generated for review')
    
    wandb.finish()
    return fine_tuned_model

if __name__ == '__main__':
    fine_tuned_model = main()
    fine_tuned_model.save_pretrained(save_directory=save_directory)

In [None]:
# End of Model Fine Tuning

### Review Output

In [8]:
# View a sample of the predictions data
training_data = pd.read_csv(filepath)
predictions_sample = pd.read_csv(predictions_filepath)
predictions_sample.head(10)
predictions_sample.shape

Unnamed: 0.1,Unnamed: 0,Generated Text,Actual Text
0,0,FirstGroup is the largest transport operator i...,02 FirstGroup Annual Report and Accounts 2013 ...
1,1,Report and Accounts for the year ended 31 Dece...,2 FirstGroup Annual Report and Accounts 2014 C...
2,2,translation impact of US dollar reduced turnov...,04 FKI plc Chairmans statement Gordon Page Cha...
3,3,"EFD, FLOmerics and MicXpress products now avai...",CHAIRMANS STATEMENT During 2007 there was good...
4,4,Directors Report Chief Executives Review Busin...,2 Energetix Group plc Group Financial Statemen...
5,5,Financial statements for the year ended 31 Dec...,2 E n e r ge t i x Group plc Group Financial S...
6,6,"Group has achieved revenue growth, reduced cos...",5 Business review In my first annual report as...
7,7,Company’s performance in 2003 was very encoura...,4 Focus Solutions Group plc Annual Report & Ac...
8,8,Focus Solutions has come a long way since floa...,4 Focus Solutions Group plc Annual Report & Ac...
9,9,Group again achieves record sales and profits....,Introduction I am pleased to be able to report...


(306, 3)

In [None]:
def shap_summary_values(model, tokenizer, text):
    text_list = []
    text_list.append(text)
    explainer = shap.Explainer(model, tokenizer)
    shap_values = explainer(text_list)
    display(shap.plots.text(shap_values))
    return

def view_summary_comparisons(i, predictions_sample=predictions_sample):
    summary1 = predictions_sample["Generated Text"].iloc[i]
    print(summary1, '\n')
    print(len(predictions_sample["Generated Text"].iloc[i].split()), '\n')
    summary2 = predictions_sample["Actual Text"].iloc[i]
    print(summary2, '\n')
    print(len(predictions_sample["Actual Text"].iloc[i].split()))
    return

def get_training_data(i, tokens, training_data=training_data):
    print(training_data.loc[i])
    print(training_data.loc[i, "ctext"])
    print("\n")
    len(training_data.loc[i, "ctext"].split())
    print("\n")
    text = ' '.join(training_data.loc[i, "ctext"].split()[0:tokens])
    print("\n")
    len(text.split())
    return text

In [None]:
fine_tuned_model = PegasusForConditionalGeneration.from_pretrained(save_directory).to(device)
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name) 
print(fine_tuned_model)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementw

#### Shap Values

In [None]:
view_summary_comparisons(0)
text1 = get_training_data(1222, tokens=200, training_data=training_data)

FirstGroup is the largest transport operator in the UK and North America. 

12 

02 FirstGroup Annual Report and Accounts 2013 Chairmans statement As we continue to manage in a climate of uncertainty, we have taken significant steps this year to enhance our flexibility and strengthen the Group for the future. In a sector that is a key enabler of economic development, the Groups diverse portfolio offers an attractive platform for sustainable growth. During the year we continued to take action to mitigate the effects of prolonged economic weakness and to place the business on a firmer footing to continue to invest for the future and deliver improved growth and returns. The Group has grown rapidly over the last 20 years through a combination of acquisition, organic growth and contract wins, and we have established a broad-based portfolio of market leading transport 

128
Unnamed: 0                                                          1222
Original_Filename_x                           

In [None]:
shap_summary_values(fine_tuned_model, tokenizer, text1)

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))



Unnamed: 0_level_0,04 First Group Annual Report and Accounts 2013 Chief Executives,"strategic review Tim O Toole Chief Executive Our services help to create strong ,",vibrant and sustainable local economies and our opportunity is to be the provider of choice for our customers and communities .,"We are the largest transport operator in the UK and North America and each day ,","every one of our 120,000 employees works hard to deliver vitally important services for our passengers .","During the last year more than 2.5 billion passengers relied on us to get to work , to education , to visit family and friends and for much more .","In May of this year , Martin Gilbert announced his intention to stand down as Chairman , once a successor has been identified .","On behalf of the Board and our employees , I would like to pay tribute to Martin and thank him for his outstanding contribution to the Company .","As Chairman and a founder , his vision and drive have led the transformation of the Group ,",and under his stewardship the business has grown to become one of the worlds leading transport operators .,"Our opportunity Our objective is to provide sustainable , integrated transport services that are safe , reliable and meet the needs of the customers and the communities we"
First,4.284,2.169,-0.206,-0.082,0.076,-0.138,-0.318,0.121,-0.136,0.079,0.263
Group,2.388,1.557,0.64,0.776,0.317,0.357,1.045,0.979,0.489,0.63,0.363
is,0.082,0.812,1.233,0.64,0.455,0.625,-0.391,-1.214,-0.214,-0.118,1.128
the,0.952,0.507,0.305,0.308,0.357,0.957,-0.488,-0.617,0.049,0.18,0.044
largest,0.461,0.238,-0.198,0.017,0.153,0.307,0.136,-0.152,0.304,0.396,0.77
transport,2.084,0.444,0.575,2.91,1.424,0.982,0.326,0.4,0.583,0.893,1.154
operator,0.637,0.283,0.219,2.576,0.952,0.408,-0.301,-0.333,-0.133,0.077,0.316
in,0.134,0.03,-0.05,0.391,0.152,0.0,-0.018,-0.027,0.016,-0.0,-0.036
the,0.584,0.259,0.027,1.077,0.436,-0.213,-0.011,0.264,-0.098,0.191,-0.117
UK,0.302,0.299,0.006,1.129,0.314,0.243,0.075,0.102,-0.102,0.242,0.01
and,0.209,-0.019,0.084,2.522,1.005,-0.039,-0.387,0.167,-0.18,-0.085,-0.258
North,0.283,0.572,0.369,5.633,2.081,0.087,0.676,0.682,0.204,0.265,0.045
America,-0.115,0.063,-0.301,1.475,0.556,-0.037,-0.214,-0.019,-0.052,-0.165,-0.72
.,0.233,0.327,0.214,0.223,0.266,0.592,0.431,0.163,0.228,0.168,0.157


None

#### ROUGE Score Calculation

In [9]:
# Rouge Score Calculation

def rouge_scores(gen_summary_list, actual_summary_list, metric='recall'):
    rouge1_scores = []
    rougeL_scores = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    for i in range(0, len(gen_summary_list)):
        scores = scorer.score(actual_summary_list[i], gen_summary_list[i])
        if metric == 'recall':
            rouge1_scores.append(scores['rouge1'][1])
            rougeL_scores.append(scores['rougeL'][1])
        elif metric == 'precision':
            rouge1_scores.append(scores['rouge1'][0])
            rougeL_scores.append(scores['rougeL'][0])
        elif metric == 'fmeasure':
            rouge1_scores.append(scores['rouge1'][2])
            rougeL_scores.append(scores['rougeL'][2])     
    print("Average Rouge-1", str(metric), ":", round(np.mean(rouge1_scores), 2))
    print("Average Rouge-L", str(metric), ":", round(np.mean(rougeL_scores), 2))
    return

In [10]:
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='recall')
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='precision')
rouge_scores(list(predictions_sample["Generated Text"]), list(predictions_sample["Actual Text"]), metric='fmeasure')

Average Rouge-1 recall : 0.07
Average Rouge-L recall : 0.05
Average Rouge-1 precision : 0.49
Average Rouge-L precision : 0.36
Average Rouge-1 fmeasure : 0.12
Average Rouge-L fmeasure : 0.09
