### Script to generate summaries using a chunking-based Pegasus approach

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../')
import utilities.paper_functions as p_fct
import utilities.functions as fct
import os
import torch
import time
from tqdm import tqdm

In [2]:
# Upper bound on the number of files listed from the summary directory and on the
# number of documents summarised and scored by the cells below.
TEXTS_COUNT = 100

# Directory of the source documents and directory of their reference summaries;
# the loading cell opens the same file name in both directories.
test_path_txt = '../SCOTUS_data/text_dev'
test_path_sum = '../SCOTUS_data/summary_dev'
# CSV read into df_target and scored column by column against the generated summaries.
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Test sur : {device}")

Test sur : cuda


In [4]:
# Loading Model and tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments

# Tokenizer and seq2seq model both come from the "nsi319/legal-pegasus" checkpoint.
# The model is moved to `device`, which is also where `summerize` sends the encoded inputs.
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")  
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus").to(device)


In [5]:
def summerize(text, max_len, min_len):
    '''
    Function to generate the summary of a single chunk using Pegasus
    input:  text - chunk to summarise (the encoded input is truncated to 512 tokens)
            max_len - Maximum length, forwarded to generate as max_length
            min_len - Minimum length, forwarded to generate as min_length
    output: decoded summary of the chunk, or "" when encoding/generation/decoding fails
    '''
    # Local import so the function does not depend on an extra notebook-level import.
    import warnings

    try:
        input_tokenized = tokenizer.encode(
            text, return_tensors='pt', max_length=512, truncation=True
        ).to(device)
        summary_ids = model.generate(
            input_tokenized,
            num_beams=9,
            length_penalty=0.1,
            min_length=min_len,
            max_length=max_len,
        )
        # Only the first returned sequence is used, so decode just that one.
        return tokenizer.decode(
            summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
    except Exception as exc:
        # Best effort: a failing chunk contributes an empty summary instead of aborting
        # the whole run, but the failure is reported rather than hidden. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        warnings.warn(
            f"summerize failed ({type(exc).__name__}: {exc}); returning an empty summary",
            RuntimeWarning,
        )
        return ""

In [6]:
def summerize_doc(nested_sentences, p):
    '''
    Function to generate summary using chunking based Pegasus
    input:  nested_sentences - chunks (iterable of strings)
            p - Number of words in summaries per word in the document
    output: list with one summary per chunk, in chunk order
    '''
    result = []
    for nested in nested_sentences:
        # Length budget of this chunk: its space-separated word count scaled by p.
        max_len = int(p * len(nested.split(" ")))
        # Keep the lower bound non-negative: short chunks would otherwise yield
        # a negative minimum length.
        min_len = max(0, max_len - 5)
        result.append(summerize(nested, max_len, min_len))
    return result

In [7]:
def legal_pegasus_summarize(text, req_len=512):
    '''
    Function to summarise a whole document with the chunking based Pegasus approach
    input:  text - document to summarise
            req_len - requested summary length in space-separated words; also passed
                      to p_fct.nest_sentences (presumably the chunk size - confirm there)
    output: chunk summaries joined by spaces, cut to at most req_len words
    '''
    word_total = len(text.split(" "))

    chunks = p_fct.nest_sentences(text, req_len)
    # Summary words wanted per document word.
    ratio = float(req_len / word_total)
    joined = " ".join(summerize_doc(chunks, ratio))

    words = joined.split(" ")
    if len(words) <= req_len:
        return joined
    # Too long: keep only the first req_len words.
    return " ".join(words[:req_len])

In [8]:
def evaluate_models(document, ref, f1_only=True):
    '''
    Function to summarise one document and score the summary against its reference
    input:  document - text to summarise
            ref - reference summary handed to fct.evaluations
            f1_only - forwarded unchanged to fct.evaluations
    output: (evaluations, summary); evaluations is whatever fct.evaluations returns,
            with an added 'Execution time' entry in seconds. The timed span covers
            both the summary generation and the scoring.
    '''
    started_at = time.time()

    summary = legal_pegasus_summarize(document)
    pegasus_evaluations = fct.evaluations(summary, ref, f1_only)

    pegasus_evaluations['Execution time'] = time.time() - started_at
    return pegasus_evaluations, summary

In [9]:
summary_ref = []
texts = []

# Each reference summary and its source document share the same file name in their
# respective directories.
# NOTE(review): os.listdir returns entries in arbitrary order, while later cells pair
# texts[i] with row i of df_target - confirm the CSV rows follow the same order.
for file_name in tqdm(os.listdir(test_path_sum)[:TEXTS_COUNT]):
    with open(os.path.join(test_path_sum, file_name), 'r', encoding="utf-8") as summary_file:
        summary_ref.append(summary_file.read())
    # Context manager so the document handle is closed too (it was left open before).
    with open(os.path.join(test_path_txt, file_name), 'r', encoding="utf-8") as text_file:
        texts.append(text_file.read())

100%|██████████| 100/100 [00:01<00:00, 56.17it/s]


In [10]:
summary_gen = []
evaluation_frames = []

df_target = pd.read_csv(target_path_csv)

# Iterate over what was actually loaded, so a directory holding fewer than TEXTS_COUNT
# files does not raise IndexError.
# NOTE(review): "TXT" lands in the f1_only parameter of evaluate_models; it is kept
# unchanged here - confirm that fct.evaluations expects this value.
for document, reference in tqdm(zip(texts, summary_ref), total=len(texts)):
    r, summary = evaluate_models(document, reference, "TXT")

    summary_gen.append(summary)
    evaluation_frames.append(r)

# Concatenate once instead of growing the frame inside the loop (quadratic copying).
# Assumes each `r` is a pandas object accepted by pd.concat, as the original loop did.
results = pd.concat(evaluation_frames, ignore_index=True) if evaluation_frames else pd.DataFrame()

df_results = pd.DataFrame({"Text": texts, "Reference": summary_ref, "Generated": summary_gen})
# Make sure the destination directory exists before writing.
os.makedirs("./output", exist_ok=True)
df_results.to_csv("./output/results_Legal_Pegasus_dev.csv", index=False)


100%|██████████| 100/100 [24:46<00:00, 14.86s/it]


In [11]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

# Columns of df_target that are scored, and the metrics accumulated for each of them.
# Driving both the accumulator and the loops from these lists keeps them in sync: an
# unexpected extra column in the CSV is ignored instead of raising KeyError.
TARGET_COLUMNS = ['facts_of_the_case', 'question', 'conclusion']
METRIC_NAMES = ['rouge1', 'rouge2', 'rougeL', 'bert_score']

scores = {
    column_name: {metric: [] for metric in METRIC_NAMES}
    for column_name in TARGET_COLUMNS
}

for column_name in TARGET_COLUMNS:
    # Row i of df_target is scored against the i-th generated summary.
    for i, gen in enumerate(summary_gen):
        ref = df_target[column_name].iloc[i]

        # ROUGE scores: score(target, prediction); only the recall component is kept.
        rouge_score = ROUGE_scorer.score(ref, gen)
        for rouge_name in ('rouge1', 'rouge2', 'rougeL'):
            scores[column_name][rouge_name].append(rouge_score[rouge_name].recall)

        # BERT score: the third value returned by score(candidates, references) is kept.
        _, _, bert_score = BERT_scorer.score([gen], [ref])
        scores[column_name]['bert_score'].append(bert_score.mean().item())

# Mean of every metric per target column; NaN when nothing was scored.
avg_scores_target = {
    col: {
        metric: (sum(values) / len(values)) if values else float('nan')
        for metric, values in scores[col].items()
    }
    for col in TARGET_COLUMNS
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Scores moyens pour facts_of_the_case :
{'rouge1': 0.35532440422949274, 'rouge2': 0.07770203265340687, 'rougeL': 0.25453679167874393, 'bert_score': 0.7981234347820282}

Scores moyens pour question :
{'rouge1': 0.3843140310863196, 'rouge2': 0.06257560207873972, 'rougeL': 0.3256732333058618, 'bert_score': 0.786503079533577}

Scores moyens pour conclusion :
{'rouge1': 0.34976077160436747, 'rouge2': 0.06723163064811506, 'rougeL': 0.2473124048583567, 'bert_score': 0.7913803547620774}


In [None]:
# Score columns that take part in the global average, limited to those present in `results`.
score_names = ('rouge1', 'rouge2', 'rougeL', 'bert_score', 'Execution time')
metrics = [col for col in results.columns if col in score_names]
means = results[metrics].mean()

# Single row labelled 'global' holding the mean of each metric over all documents.
global_row = means.to_frame(name='global').T

# Per-section averages: one row per target section, one column per metric.
df_avg_scores_target = pd.DataFrame(avg_scores_target).transpose()

# The per-section averages have no timing; add an empty column so both tables line up.
if 'Execution time' not in df_avg_scores_target.columns:
    df_avg_scores_target['Execution time'] = None

df_score = pd.concat([global_row, df_avg_scores_target])

In [16]:
# Show the first rows of the score table (global row followed by the per-section rows).
print(df_score.head())

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.384479  0.136188  0.211285    0.822709       14.860497
facts_of_the_case  0.355324  0.077702  0.254537    0.798123             NaN
question           0.384314  0.062576  0.325673    0.786503             NaN
conclusion         0.349761  0.067232  0.247312    0.791380             NaN


In [15]:
# Persist the score table; the index is written too, so the row labels are kept.
df_score.to_csv("./output/scores_Legal_Pegasus_dev.csv")

### Score sur le résumé global pour chaque texte

In [13]:
# Style the per-document results; axis=None hands the whole table to the styling function.
# NOTE(review): fct.highlight_min_max presumably marks the min/max values - confirm in utilities.functions.
styled_df = results.style.apply(fct.highlight_min_max, axis=None)

# Bare last expression so the notebook renders the styled table.
styled_df

Unnamed: 0,rouge1,rouge2,rougeL,bert_score,Execution time
0,0.448513,0.277523,0.302059,0.858676,15.526874
1,0.594406,0.294393,0.270396,0.863777,13.294659
2,0.231527,0.08642,0.123153,0.834347,13.953717
3,0.473945,0.174129,0.230769,0.841214,13.758287
4,0.340153,0.084615,0.179028,0.809219,14.495766
5,0.23913,0.06267,0.157609,0.797973,15.836517
6,0.492683,0.171149,0.27561,0.823035,14.610379
7,0.277778,0.047745,0.190476,0.783009,15.812673
8,0.477612,0.199501,0.248756,0.839019,14.682599
9,0.64433,0.229974,0.319588,0.831018,15.341376
