### Script to generate summaries using the chunking-based BART method

Set `extractive_name`, `data_path`, `target_path_csv`, and the output paths passed to `to_csv` according to your requirements.


In [None]:
import bart
import sys
sys.path.insert(0, '../')
from utilities.BART_utilities import *
import utilities.functions as fct

import pandas as pd

import time
from tqdm import tqdm


In [2]:
# --- Run configuration ---
# Number of documents that the later cells summarize and score.
TEXTS_COUNT = 100

# Name of the extractive method; it is interpolated into the input/output file names.
extractive_name = 'BM25'

# Input CSV locations.
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'
data_path = f'../evaluate models/output/results_{extractive_name}_dev.csv'

# --- Load data ---
df = pd.read_csv(data_path)
df_target = pd.read_csv(target_path_csv)

# Column views reused by the summarization loop and the result export.
text_txt, gen_txt, ref_txt = df['Text'], df['Generated'], df['Reference']

In [None]:
# Build the BART tokenizer and model once; evaluate_models reads both
# from the notebook namespace.
bart_components = bart.init_bart()
tokenizer, bart_model = bart_components

In [5]:
def evaluate_models(document, ref, short=True):
    """Summarize ``document`` with BART, score the summary, and time the step.

    Uses the notebook-level ``tokenizer`` and ``bart_model`` objects.

    Args:
        document: Text passed to ``bart.BART_summarize``.
        ref: Reference text passed to ``fct.evaluations`` with the summary.
        short: Forwarded unchanged as the third argument of
            ``fct.evaluations``.

    Returns:
        A ``(bart_evaluations, summary)`` tuple: the object returned by
        ``fct.evaluations`` with an added ``'Execution time'`` entry
        (elapsed seconds), and the generated summary.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    summary = bart.BART_summarize(document, tokenizer, bart_model)
    bart_evaluations = fct.evaluations(summary, ref, short)

    # The timed span covers both the summarization and the metric computation.
    execution_time = time.perf_counter() - start_time
    bart_evaluations['Execution time'] = execution_time

    return bart_evaluations, summary

In [None]:
# Run BART on the first TEXTS_COUNT extractive outputs and score each summary.
summary_gen = []
per_text_results = []

for i in tqdm(range(TEXTS_COUNT)):
    # NOTE(review): "TXT" is forwarded as the `short` argument of
    # fct.evaluations - confirm this is the intended mode.
    r, summary = evaluate_models(gen_txt.iloc[i], ref_txt.iloc[i], "TXT")

    summary_gen.append(summary)
    per_text_results.append(r)

# Concatenate once instead of growing the frame inside the loop (which copies
# every previous row on each iteration).
results = pd.concat(per_text_results, ignore_index=True) if per_text_results else pd.DataFrame()

# Only the processed rows are exported: mixing the full-length columns with the
# shorter list of summaries raises a length-mismatch ValueError whenever the
# input CSV holds more than TEXTS_COUNT rows.
n_done = len(summary_gen)
df_results = pd.DataFrame({
    "Text": text_txt.iloc[:n_done],
    "Reference": ref_txt.iloc[:n_done],
    "Generated": summary_gen,
})
df_results.to_csv(f"./output/results_Hybrid_{extractive_name}_BART_dev.csv", index=False)


In [7]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

# One list of per-text values for each (reference section, metric) pair.
# The ROUGE entries store recall; 'bert_score' stores the third value returned
# by BERT_scorer.score (F1 per the bert_score documentation).
scores = {
    section: {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bert_score': []}
    for section in ('facts_of_the_case', 'question', 'conclusion')
}

# Score only the columns that have a slot in `scores`; any other column in the
# target CSV (e.g. a saved index column) would otherwise raise a KeyError.
scored_columns = [col for col in df_target.columns if col in scores]

assert len(df_target) >= len(summary_gen), "fewer reference rows than generated summaries"

for column_name in scored_columns:
    refs = df_target[column_name].iloc[:len(summary_gen)].tolist()

    # ROUGE: RougeScorer.score takes (target, prediction).
    for ref, gen in zip(refs, summary_gen):
        rouge_result = ROUGE_scorer.score(ref, gen)
        for rouge_type in ('rouge1', 'rouge2', 'rougeL'):
            scores[column_name][rouge_type].append(rouge_result[rouge_type].recall)

    # BERTScore: score the whole column in one batched call instead of one
    # model call per text.
    if refs:
        _, _, bert_f1 = BERT_scorer.score(summary_gen, refs)
        scores[column_name]['bert_score'].extend(bert_f1.tolist())


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float('nan')


avg_scores_target = {
    col: {metric: _mean(values) for metric, values in scores[col].items()}
    for col in scored_columns
}

for col, col_averages in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(col_averages)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Scores moyens pour facts_of_the_case :
{'rouge1': 0.31532520810984754, 'rouge2': 0.04532309870890379, 'rougeL': 0.18576773018152953, 'bert_score': 0.7795057898759842}

Scores moyens pour question :
{'rouge1': 0.3736832068932117, 'rouge2': 0.04445293580864708, 'rougeL': 0.2959053749157244, 'bert_score': 0.7760913699865342}

Scores moyens pour conclusion :
{'rouge1': 0.32878845105964233, 'rouge2': 0.046902734425753184, 'rougeL': 0.18883808626870127, 'bert_score': 0.779408050775528}


In [None]:
# Columns of the per-text results that are averaged into the "global" row.
metrics = [col for col in results.columns if col in ['rouge1', 'rouge2', 'rougeL', 'bert_score', 'Execution time']]
means = results[metrics].mean()

# One row per reference section, one column per metric.
df_avg_scores_target = pd.DataFrame(avg_scores_target).T

global_row = pd.DataFrame(means).T
global_row.index = ['global']

# The per-section rows have no timing. Fill with a float NaN rather than None:
# None creates an object-dtype column, and getting a float column back from
# pd.concat then relies on its deprecated handling of all-NA entries.
if 'Execution time' not in df_avg_scores_target.columns:
    df_avg_scores_target = df_avg_scores_target.assign(**{'Execution time': float('nan')})

df_score = pd.concat([global_row, df_avg_scores_target], axis=0)

In [9]:
# Show the score table through the notebook's rich display (last expression
# of the cell) instead of print(), which falls back to the plain-text repr.
df_score.head()

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.361414  0.116111  0.175532    0.806600       11.357748
facts_of_the_case  0.315325  0.045323  0.185768    0.779506             NaN
question           0.373683  0.044453  0.295905    0.776091             NaN
conclusion         0.328788  0.046903  0.188838    0.779408             NaN


In [10]:
# Persist the aggregated score table; the index is written because it holds
# the row labels.
scores_csv_path = f"./output/scores_Hybrid_{extractive_name}_BART_dev.csv"
df_score.to_csv(scores_csv_path)

In [11]:
# Sum of the per-text 'Execution time' values recorded in `results`.
total_execution_time = results["Execution time"].sum()
print("Execution time in total : ", total_execution_time)

Execution time in total :  1135.7747793197632


### Score sur le résumé global pour chaque texte

In [12]:
# Apply the project's highlight_min_max styling function to the whole
# per-text results table at once (axis=None), then display it.
results_styler = results.style
styled_df = results_styler.apply(fct.highlight_min_max, axis=None)

styled_df

Unnamed: 0,rouge1,rouge2,rougeL,bert_score,Execution time
0,0.44757,0.210256,0.232737,0.846688,15.921417
1,0.539409,0.212346,0.229064,0.841628,11.059104
2,0.233553,0.066007,0.131579,0.803474,11.772801
3,0.414286,0.111111,0.228571,0.794689,10.447952
4,0.502732,0.241096,0.314208,0.840611,10.904805
5,0.351536,0.130137,0.174061,0.818617,10.439716
6,0.403974,0.086379,0.201987,0.791358,11.413492
7,0.285714,0.055375,0.12987,0.759382,10.988787
8,0.407285,0.126246,0.162252,0.797152,11.264571
9,0.321888,0.107759,0.167382,0.779567,10.881045
