In [1]:
# %pip install bm25s
# %pip install spacy
# %pip install -U 'spacy[cuda12x]'
# %pip install rouge_score
# %pip install pysbd

In [2]:
import sys
sys.path.insert(0, '../')
import utilities.functions as fct
import time
import os
import pandas as pd
from tqdm import tqdm

# BM25

In [3]:
def evaluate_models(document, paragraph_target, short=True, model_name=None):
    """Summarize one document with ``fct.bb25LegalSum`` and score the summary.

    Args:
        document: Source text to summarize.
        paragraph_target: Reference text the generated summary is scored against.
        short: Forwarded unchanged to ``fct.evaluations``.
        model_name: Model identifier forwarded to ``fct.bb25LegalSum``. When
            ``None`` (the default) the notebook-level global ``model`` is used,
            so existing three-argument calls behave exactly as before.

    Returns:
        A ``(results, summaries)`` tuple. ``results`` is the object returned by
        ``fct.evaluations`` (presumably a one-row DataFrame -- confirm in
        ``utilities.functions``) with an added ``'Execution time'`` entry in
        seconds, re-indexed through ``pd.concat``. ``summaries`` is a
        one-element list holding the summary sentences joined by newlines.
    """
    if model_name is None:
        # Fall back to the notebook global so callers that rely on it keep working.
        model_name = model

    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the system clock is adjusted during a long run.
    start_time = time.perf_counter()

    # Split the source document into sentences.
    sentences = fct.sent_segmentation(document, method="pySBD")

    # Build the query from the document and select the summary sentences.
    query = fct.select_query(document)
    summary = fct.bb25LegalSum(sentences, model_name, query)

    # Score the space-joined summary against the reference text.
    bb25_evaluation = fct.evaluations(" ".join(summary), paragraph_target, short)

    # The timing deliberately covers segmentation, summarization and scoring,
    # as in the original implementation.
    bb25_evaluation['Execution time'] = time.perf_counter() - start_time

    # Concatenating onto an empty frame with ignore_index=True is kept so the
    # returned frame has the same fresh 0-based index as before.
    results = pd.concat([pd.DataFrame(), bb25_evaluation], ignore_index=True)
    summaries = ["\n".join(summary)]

    return results, summaries

### BM25 with pySBD on 100 cleaned documents

In [4]:
# Input locations: source texts, reference summaries, and the target CSV.
test_path_txt = '../SCOTUS_data/text_dev'
test_path_sum = '../SCOTUS_data/summary_dev'
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

# Upper bound on the number of documents loaded and evaluated.
TEXTS_COUNT = 100

summary_ref = []
texts = []

# NOTE(review): os.listdir returns entries in arbitrary order, and later cells
# pair document i with row i of the target CSV -- confirm that the CSV was
# built in the same order before relying on the per-section scores.
for file_name in tqdm(os.listdir(test_path_sum)[:TEXTS_COUNT]):
    # Reference summary for this document.
    with open(os.path.join(test_path_sum, file_name), 'r', encoding="utf-8") as summary_file:
        summary_ref.append(summary_file.read())
    # The source text is looked up under the same file name; read it inside a
    # context manager so the handle is closed deterministically.
    with open(os.path.join(test_path_txt, file_name), 'r', encoding="utf-8") as text_file:
        texts.append(text_file.read())

100%|██████████| 100/100 [00:00<00:00, 1061.44it/s]


In [5]:
# Model identifier read by evaluate_models (as a notebook-level global).
model = "nlpaueb/legal-bert-base-uncased"

summary_gen = []
result_frames = []
df_target = pd.read_csv(target_path_csv)

for i in tqdm(range(TEXTS_COUNT)):
    r, summary = evaluate_models(texts[i], summary_ref[i], True)

    # `summary` is the list returned by evaluate_models; later cells index into it.
    summary_gen.append(summary)
    result_frames.append(r)

# Concatenate once instead of re-copying the accumulated frame on every iteration.
results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Make sure the output directory exists so a long run is not lost at write time.
os.makedirs("./output", exist_ok=True)
df_results = pd.DataFrame({"Text": texts, "Reference": summary_ref, "Generated": summary_gen})
df_results.to_csv("./output/results_BM25_dev.csv", index=False)


100%|██████████| 100/100 [44:49<00:00, 26.90s/it]


In [6]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# Scorers shared by every (section, document) pair below.
ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

rouge_keys = ('rouge1', 'rouge2', 'rougeL')
section_names = ('facts_of_the_case', 'question', 'conclusion')

# One list of per-document values for each metric of each target section.
scores = {
    section: {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bert_score': []}
    for section in section_names
}

for column_name in df_target.columns:
    section_scores = scores[column_name]
    references = df_target[column_name]

    for i in range(TEXTS_COUNT):
        ref = references.iloc[i]
        # Each entry of summary_gen is a one-element list; take the summary string.
        gen = summary_gen[i][0]

        # ROUGE: only the recall component is recorded.
        rouge_result = ROUGE_scorer.score(ref, gen)
        for key in rouge_keys:
            section_scores[key].append(rouge_result[key].recall)

        # BERTScore: keep the third returned tensor (F1 in the library's P, R, F order).
        _precision, _recall, f1 = BERT_scorer.score([gen], [ref])
        section_scores['bert_score'].append(f1.mean().item())

# Arithmetic mean of every metric, per target section.
avg_scores_target = {
    col: {name: sum(values) / len(values) for name, values in scores[col].items()}
    for col in df_target.columns
}

for col, section_avgs in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(section_avgs)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Scores moyens pour facts_of_the_case :
{'rouge1': 0.33016237207558996, 'rouge2': 0.04267705148176444, 'rougeL': 0.19335845020310014, 'bert_score': 0.7879923415184021}

Scores moyens pour question :
{'rouge1': 0.37029951352052015, 'rouge2': 0.038176314126858436, 'rougeL': 0.29863460104689166, 'bert_score': 0.7839836913347245}

Scores moyens pour conclusion :
{'rouge1': 0.33871303577699163, 'rouge2': 0.04292023173699055, 'rougeL': 0.19378330163356694, 'bert_score': 0.7876042199134826}


In [7]:
# Columns of the per-document results that are averaged into the 'global' row.
metrics = [col for col in results.columns if col in ['rouge1', 'rouge2', 'rougeL', 'bert_score', 'Execution time']]
means = results[metrics].mean()

# One row per target section, one column per metric.
df_avg_scores_target = pd.DataFrame(avg_scores_target).T

global_row = pd.DataFrame(means).T
global_row.index = ['global']

if 'Execution time' not in df_avg_scores_target.columns:
    # Execution time is only measured for the global run. Fill with float NaN
    # rather than None: an all-None object column makes pd.concat emit the
    # "empty or all-NA entries" FutureWarning and would change the column's
    # dtype once that deprecation is enforced.
    df_avg_scores_target = df_avg_scores_target.assign(**{'Execution time': float('nan')})

df_score = pd.concat([global_row, df_avg_scores_target], axis=0)

  df_score = pd.concat([global_row, df_avg_scores_target], axis=0)


In [8]:
# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display instead of plain printed text.
df_score.head()

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.395154  0.136824  0.196767    0.821098       26.897771
facts_of_the_case  0.330162  0.042677  0.193358    0.787992             NaN
question           0.370300  0.038176  0.298635    0.783984             NaN
conclusion         0.338713  0.042920  0.193783    0.787604             NaN


In [9]:
# Persist the score table; the index (row labels) is written as the first column.
df_score.to_csv(path_or_buf="./output/scores_BM25_dev.csv")

### Scores on the overall summary for each text

In [None]:
# Apply fct.highlight_min_max column by column (presumably marking each
# column's extreme values -- confirm in utilities.functions).
styled_df = results.style.apply(func=fct.highlight_min_max, axis="index")

styled_df