### Script to generate summaries using the chunking-based BART method

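Set `extractive_name` (which selects the input results file), `target_path_csv` and `TEXTS_COUNT` in the configuration cell according to your requirements. Output files are written to `./output/`.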


In [1]:
import bart
import sys
sys.path.insert(0, '../')
from utilities.BART_utilities import *
import utilities.functions as fct

import pandas as pd

import time
from tqdm import tqdm


In [2]:
# Number of documents (leading rows of the input CSVs) to summarize and score.
TEXTS_COUNT = 100

# Selects which results file is read below and is reused in the output file names.
extractive_name = 'Luhn'

data_path = f'../evaluate models/output/results_{extractive_name}_dev.csv'
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

df = pd.read_csv(data_path)
df_target = pd.read_csv(target_path_csv)

# Fail fast: the cells below index rows 0..TEXTS_COUNT-1 of both frames, and a
# mismatch would otherwise only surface in the middle of the slow summarization loop.
missing_columns = {'Text', 'Generated', 'Reference'} - set(df.columns)
assert not missing_columns, f"{data_path} is missing columns: {sorted(missing_columns)}"
assert TEXTS_COUNT <= len(df), (
    f"TEXTS_COUNT={TEXTS_COUNT} but {data_path} only has {len(df)} rows"
)
assert TEXTS_COUNT <= len(df_target), (
    f"TEXTS_COUNT={TEXTS_COUNT} but {target_path_csv} only has {len(df_target)} rows"
)

text_txt = df['Text']
# NOTE(review): 'Generated' is presumably the output of the `extractive_name`
# method; it is what gets fed to BART below. Confirm against the producing notebook.
gen_txt = df['Generated']
ref_txt = df['Reference']

In [None]:
# Load the BART tokenizer and model once; both are reused as globals by
# evaluate_models() for every document.
# NOTE(review): presumably loads pretrained weights (slow, may download) — confirm in bart.init_bart.
tokenizer, bart_model = bart.init_bart()

In [4]:
def evaluate_models(document, ref, short=True):
    """Summarize ``document`` with BART and score the summary against ``ref``.

    Relies on the notebook-level ``tokenizer`` and ``bart_model`` globals.

    Parameters
    ----------
    document
        Input text passed to ``bart.BART_summarize``.
    ref
        Reference summary passed to ``fct.evaluations``.
    short
        Forwarded unchanged as the third argument of ``fct.evaluations``;
        its meaning is defined by that helper.

    Returns
    -------
    tuple
        ``(bart_evaluations, summary)``: the object returned by
        ``fct.evaluations`` with an added ``'Execution time'`` entry (seconds),
        and the summary returned by ``bart.BART_summarize``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    summary = bart.BART_summarize(document, tokenizer, bart_model)
    bart_evaluations = fct.evaluations(summary, ref, short)

    # The timed span covers both summary generation and metric computation.
    execution_time = time.perf_counter() - start_time
    bart_evaluations['Execution time'] = execution_time

    return bart_evaluations, summary

In [None]:
# Summarize and score the first TEXTS_COUNT documents.
summary_gen = []
evaluation_frames = []

for i in tqdm(range(0, TEXTS_COUNT)):
    # NOTE(review): "TXT" is passed for the `short` parameter, whose default in
    # evaluate_models is True; confirm this is a value fct.evaluations expects.
    r, summary = evaluate_models(gen_txt[i], ref_txt[i], "TXT")

    summary_gen.append(summary)
    evaluation_frames.append(r)

# Concatenate once: growing a DataFrame inside the loop re-copies all previous
# rows on every iteration.
results = pd.concat(evaluation_frames, ignore_index=True) if evaluation_frames else pd.DataFrame()

# Only the first TEXTS_COUNT rows were summarized, so the text and reference
# columns are cut to the same length; otherwise the constructor raises as soon
# as TEXTS_COUNT is smaller than the number of rows in the input file.
df_results = pd.DataFrame({
    "Text": text_txt.iloc[:TEXTS_COUNT],
    "Reference": ref_txt.iloc[:TEXTS_COUNT],
    "Generated": summary_gen,
})
df_results.to_csv(f"./output/results_Hybrid_{extractive_name}_BART_dev.csv", index=False)


In [6]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

ROUGE_KEYS = ('rouge1', 'rouge2', 'rougeL')

# Per-section score lists: one entry per scored document and metric.
scores = {
    section: {metric: [] for metric in (*ROUGE_KEYS, 'bert_score')}
    for section in ('facts_of_the_case', 'question', 'conclusion')
}

# Only score target columns that have a slot in `scores`; any other column in
# the CSV (e.g. a saved index) would otherwise raise a KeyError.
scored_columns = [name for name in df_target.columns if name in scores]
skipped_columns = [name for name in df_target.columns if name not in scores]
if skipped_columns:
    print(f"Skipping unexpected target columns: {skipped_columns}")

for column_name in scored_columns:
    section_scores = scores[column_name]
    for i in range(TEXTS_COUNT):
        ref = df_target[column_name].iloc[i]
        gen = summary_gen[i]

        # ROUGE: score(target, prediction). Only the recall component is kept.
        # NOTE(review): confirm recall (rather than F-measure) matches what
        # fct.evaluations reports, since both end up in the same score table.
        rouge_result = ROUGE_scorer.score(ref, gen)
        for rouge_key in ROUGE_KEYS:
            section_scores[rouge_key].append(rouge_result[rouge_key].recall)

        # BERTScore: score(candidates, references) returns (P, R, F); keep F.
        _, _, bert_f1 = BERT_scorer.score([gen], [ref])
        section_scores['bert_score'].append(bert_f1.mean().item())

# Mean of every metric for each scored section.
avg_scores_target = {
    col: {metric: sum(values) / len(values) for metric, values in scores[col].items()}
    for col in scored_columns
}

for col, section_averages in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(section_averages)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Scores moyens pour facts_of_the_case :
{'rouge1': 0.38564778791607784, 'rouge2': 0.057386151369229955, 'rougeL': 0.21994708194878076, 'bert_score': 0.7892847824096679}

Scores moyens pour question :
{'rouge1': 0.44414578131153554, 'rouge2': 0.06111045696758175, 'rougeL': 0.3493056949927741, 'bert_score': 0.7856728154420852}

Scores moyens pour conclusion :
{'rouge1': 0.40271724959984234, 'rouge2': 0.0652287378976085, 'rougeL': 0.2250423059518968, 'bert_score': 0.7915278041362762}


In [None]:
# Columns of the per-document results that are averaged into the 'global' row.
summary_columns = ('rouge1', 'rouge2', 'rougeL', 'bert_score', 'Execution time')
metrics = [name for name in results.columns if name in summary_columns]
means = results[metrics].mean()

# Turn the Series of means into a single row labelled 'global'.
global_row = means.to_frame().transpose()
global_row.index = ['global']

# One row per target section, one column per metric.
df_avg_scores_target = pd.DataFrame(avg_scores_target).transpose()

# The per-section averages carry no timing; add the column with empty values
# so both frames expose the same columns.
if 'Execution time' not in df_avg_scores_target.columns:
    df_avg_scores_target = df_avg_scores_target.assign(**{'Execution time': None})

# Stack the global row on top of the per-section rows.
df_score = pd.concat([global_row, df_avg_scores_target])

In [8]:
# Last expression of the cell: rendered as a rich table instead of plain text.
df_score.head()

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.401559  0.138550  0.184157    0.822910       10.918485
facts_of_the_case  0.385648  0.057386  0.219947    0.789285             NaN
question           0.444146  0.061110  0.349306    0.785673             NaN
conclusion         0.402717  0.065229  0.225042    0.791528             NaN


In [12]:
# Persist the score table; the row labels are written as the first CSV column.
scores_csv_path = f"./output/scores_Hybrid_{extractive_name}_BART_dev.csv"
df_score.to_csv(scores_csv_path)

In [10]:
# Sum of the per-document timings recorded by evaluate_models (seconds).
total_execution_time = results["Execution time"].sum()
print("Execution time in total : ", total_execution_time)

Execution time in total :  1091.8484597206116


### Scores of the full summary for each text

In [11]:
# Apply the highlighting helper to the whole table at once (axis=None passes
# the full DataFrame to the function rather than one row or column at a time).
results_styler = results.style
styled_df = results_styler.apply(fct.highlight_min_max, axis=None)

styled_df

Unnamed: 0,rouge1,rouge2,rougeL,bert_score,Execution time
0,0.421546,0.166667,0.17096,0.835012,12.174277
1,0.275862,0.093426,0.144828,0.788067,10.314229
2,0.431217,0.183024,0.219577,0.836508,10.343457
3,0.48538,0.14956,0.204678,0.802259,10.752008
4,0.366569,0.085294,0.13783,0.817724,10.638265
5,0.282913,0.067416,0.148459,0.818337,11.067551
6,0.348974,0.123529,0.155425,0.807232,10.654406
7,0.439891,0.126027,0.172131,0.833844,10.697122
8,0.49763,0.168646,0.206161,0.827007,10.801412
9,0.461165,0.211679,0.196602,0.836707,10.747047
