### Script to generate summaries using the chunking-based BART method

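Before running, set `extractive_name`, `data_path` and `target_path_csv` in the configuration cell to match your data; the output CSV paths are given in the cells that write the files.  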


In [1]:
import bart
import sys
sys.path.insert(0, '../')
# NOTE: the wildcard import deliberately stays ahead of the explicit imports below,
# so that any same-named export it brings in (e.g. `time`, `tqdm`, `pd`) is
# overridden by them rather than the other way round.
from utilities.BART_utilities import *
import utilities.paper_functions as p_fct
import utilities.functions as fct

import pandas as pd
import numpy as np
import os

import time
from tqdm import tqdm

from bert_score import BERTScorer
from rouge_score import rouge_scorer


In [2]:
# --- Run configuration ---------------------------------------------------
# How many documents, counted from the top of the dev file, are processed.
TEXTS_COUNT = 100

# Name of the extractive stage; it selects which results CSV is loaded below.
extractive_name = 'LexRank'

data_path = f'../evaluate models/output/results_{extractive_name}_dev.csv'
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

# --- Load inputs ---------------------------------------------------------
# `df` holds the extractive results, `df_target` the reference file.
df, df_target = (pd.read_csv(csv_path) for csv_path in (data_path, target_path_csv))

# Column views used by the later cells.
text_txt, gen_txt, ref_txt = (df[column] for column in ('Text', 'Generated', 'Reference'))

In [None]:
# Load the tokenizer and the BART model once. Both stay notebook-level globals
# because evaluate_models() reads them directly instead of taking them as arguments.
tokenizer, bart_model = bart.init_bart()

In [5]:
def evaluate_models(document, ref, short=True):
    """Summarise ``document`` with BART, score the summary against ``ref`` and time both steps.

    Relies on the notebook-level ``tokenizer`` and ``bart_model`` objects.

    Args:
        document: Text handed to ``bart.BART_summarize``.
        ref: Reference summary handed to ``fct.evaluations``.
        short: Forwarded unchanged as the third argument of ``fct.evaluations``
            (its meaning is defined there — TODO confirm the accepted values).

    Returns:
        A ``(bart_evaluations, summary)`` tuple: the object returned by
        ``fct.evaluations`` with an extra ``'Execution time'`` entry, and the
        generated summary. The time is in seconds and covers summarisation
        *and* scoring.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or turn negative) when the system clock is adjusted during a long run.
    start_time = time.perf_counter()

    summary = bart.BART_summarize(document, tokenizer, bart_model)
    bart_evaluations = fct.evaluations(summary, ref, short)

    bart_evaluations['Execution time'] = time.perf_counter() - start_time

    return bart_evaluations, summary

In [None]:
# Summarise and score the first TEXTS_COUNT documents, then save the summaries.
# (`df_target` is already loaded by the configuration cell, so it is not re-read here.)

# Fail before the long generation loop starts rather than part-way through it.
if TEXTS_COUNT > len(df):
    raise ValueError(
        f"TEXTS_COUNT={TEXTS_COUNT} exceeds the {len(df)} rows available in {data_path}"
    )

# Create the output folder up front so a missing directory cannot make the
# save fail after all summaries have been generated.
os.makedirs("./output", exist_ok=True)

summary_gen = []
evaluation_rows = []

for i in tqdm(range(0, TEXTS_COUNT)):
    r, summary = evaluate_models(gen_txt.iloc[i], ref_txt.iloc[i], "TXT")

    summary_gen.append(summary)
    evaluation_rows.append(r)

# Concatenate once after the loop instead of re-copying a growing frame on every iteration.
results = pd.concat(evaluation_rows, ignore_index=True) if evaluation_rows else pd.DataFrame()

# Keep only the rows that were actually processed so the three columns have the
# same length even when the input file holds more than TEXTS_COUNT documents.
df_results = pd.DataFrame({
    "Text": text_txt.iloc[:TEXTS_COUNT],
    "Reference": ref_txt.iloc[:TEXTS_COUNT],
    "Generated": summary_gen,
})
df_results.to_csv("./output/results_Hybrid_Lex_BART_dev.csv", index=False)


In [7]:
# Per-section evaluation: every generated summary is compared with each
# reference section of the target file.
ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

# Iterate over the declared sections rather than df_target.columns: an extra
# column in the CSV (e.g. a saved index) would otherwise raise KeyError on `scores`.
target_sections = ['facts_of_the_case', 'question', 'conclusion']
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']

# scores[section][metric] -> list with one value per document
scores = {
    section: {metric: [] for metric in rouge_metrics + ['bert_score']}
    for section in target_sections
}

for column_name in target_sections:
    for i in range(TEXTS_COUNT):
        ref = df_target[column_name].iloc[i]
        gen = summary_gen[i]

        # ROUGE scores: score(target, prediction); only the recall component is kept
        rouge_score = ROUGE_scorer.score(ref, gen)
        for metric in rouge_metrics:
            scores[column_name][metric].append(rouge_score[metric].recall)

        # BERT scores: score(candidates, references) returns three tensors; the third one is kept
        _, _, bert_score = BERT_scorer.score([gen], [ref])
        scores[column_name]['bert_score'].append(bert_score.mean().item())

# Mean of every metric per section (NaN when nothing was scored, instead of ZeroDivisionError)
avg_scores_target = {
    col: {
        metric: sum(values) / len(values) if values else float('nan')
        for metric, values in scores[col].items()
    }
    for col in target_sections
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Scores moyens pour facts_of_the_case :
{'rouge1': 0.37020796491279695, 'rouge2': 0.05806610093525565, 'rougeL': 0.21624350260562994, 'bert_score': 0.7881969064474106}

Scores moyens pour question :
{'rouge1': 0.41588107689105125, 'rouge2': 0.05931251048403536, 'rougeL': 0.32878381046203214, 'bert_score': 0.7829263806343079}

Scores moyens pour conclusion :
{'rouge1': 0.393694843453856, 'rouge2': 0.0699420574260417, 'rougeL': 0.22504768192217192, 'bert_score': 0.7901122510433197}


In [8]:
# Build the summary table: one "global" row (mean of the per-document scores
# in `results`) followed by one row per reference section.
metrics = [col for col in results.columns if col in ['rouge1', 'rouge2', 'rougeL', 'bert_score', 'Execution time']]
means = results[metrics].mean()

# Transpose so that sections become rows and metrics become columns.
df_avg_scores_target = pd.DataFrame(avg_scores_target).T

global_row = pd.DataFrame(means).T
global_row.index = ['global']

# The per-section rows have no timing. Fill the column with a float NaN rather than
# None: None creates an all-NA object column, and concatenating such columns is
# deprecated in pandas (FutureWarning) and would eventually turn the column into object dtype.
if 'Execution time' not in df_avg_scores_target.columns:
    df_avg_scores_target = df_avg_scores_target.assign(**{'Execution time': np.nan})

df_score = pd.concat([global_row, df_avg_scores_target], axis=0)

  df_score = pd.concat([global_row, df_avg_scores_target], axis=0)


In [9]:
# Bare last expression: the notebook renders the frame as a table instead of plain text.
df_score.head()

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.441385  0.162161  0.203398    0.822748        9.930518
facts_of_the_case  0.370208  0.058066  0.216244    0.788197             NaN
question           0.415881  0.059313  0.328784    0.782926             NaN
conclusion         0.393695  0.069942  0.225048    0.790112             NaN


In [10]:
# Save the score table; the index is written too (to_csv default), so the
# row labels end up in the first CSV column.
df_score.to_csv("./output/scores_Hybrid_Lex_BART_dev.csv")

In [11]:
# Sum of the per-document 'Execution time' values stored in `results` by the generation loop.
print("Execution time in total : ", results["Execution time"].sum())

Execution time in total :  993.0517783164978


### Scores of the overall summary for each text

In [12]:
# Per-document scores of the full summaries. axis=None makes Styler.apply pass the
# whole frame to fct.highlight_min_max in a single call (presumably to mark the
# lowest and highest values — confirm in utilities/functions.py).
styled_df = results.style.apply(fct.highlight_min_max, axis=None)

# Bare expression as the last line so the notebook renders the styled table.
styled_df

Unnamed: 0,rouge1,rouge2,rougeL,bert_score,Execution time
0,0.467662,0.289277,0.268657,0.857896,11.185449
1,0.592506,0.284038,0.316159,0.869969,9.996724
2,0.2525,0.062657,0.1225,0.83165,10.082332
3,0.44702,0.166113,0.218543,0.806154,9.664194
4,0.496042,0.161376,0.195251,0.817917,9.462594
5,0.406162,0.140449,0.187675,0.844469,9.513701
6,0.588235,0.149606,0.294118,0.793299,9.414337
7,0.487805,0.237164,0.226829,0.847137,9.704876
8,0.543353,0.205797,0.283237,0.835641,9.494938
9,0.589595,0.22029,0.271676,0.812589,9.507273
