In [1]:
# %pip install bm25s
# %pip install spacy
# %pip install -U 'spacy[cuda12x]'
# %pip install rouge_score
# %pip install pysbd

In [1]:
import sys
sys.path.insert(0, '../')
import utilities.functions as fct
import time
import os
import pandas as pd
from tqdm import tqdm

# BM25

In [2]:
# Path to the SCOTUS training split (JSON). Only the training file is
# referenced in this cell; no development-split path is defined here.
train_path_json = 'SCOTUS/train.json'

# Load the training JSON through the project helper `fct.open_file`.
train_json = fct.open_file(train_path_json, "json")

def save_txt(text, folder_name, file_name, encoding='utf-8'):
    """Write ``text`` to ``folder_name/file_name``, creating the folder if needed.

    Parameters
    ----------
    text : str
        Content to write. Any existing file at the destination is overwritten.
    folder_name : str
        Destination directory; missing parent directories are created.
        An empty string means "the current working directory".
    file_name : str
        Name of the file inside ``folder_name``.
    encoding : str, optional
        Text encoding used for the file. Defaults to UTF-8 so that non-ASCII
        characters are written identically on every platform instead of
        depending on the locale's default encoding.
    """
    # `exist_ok=True` replaces the racy "check then create" pattern.
    # The truthiness guard is needed because os.makedirs('') raises.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    file_path = os.path.join(folder_name, file_name)

    with open(file_path, 'w', encoding=encoding) as file:
        file.write(text)

# Evaluate the summarizer once per sentence-segmentation method
def evaluate_models(document, paragraph_target, file_type, only_f1=True,
                    segmentation_methods=None, summary_model=None):
    """Summarize ``document`` with each segmentation method and score the result.

    For every segmentation method the document is split into sentences, a
    query is selected, a summary is produced with ``fct.bb25LegalSum`` and the
    summary is scored against ``paragraph_target`` with ``fct.evaluations``.

    Parameters
    ----------
    document : str
        Source text to summarize.
    paragraph_target : str
        Reference text the generated summary is compared against.
    file_type : str
        Label prefixed to the method name in the ``Method`` column
        (e.g. ``"JSON"`` gives ``"JSON_nltk"``).
    only_f1 : bool, optional
        Forwarded unchanged to ``fct.evaluations``.
    segmentation_methods : iterable of str, optional
        Segmentation method names to test. When omitted, the notebook-level
        ``methods`` variable is used, which must then already be defined.
    summary_model : str, optional
        Model identifier forwarded to ``fct.bb25LegalSum``. When omitted, the
        notebook-level ``model`` variable is used.

    Returns
    -------
    results : pandas.DataFrame
        The frames returned by ``fct.evaluations`` stacked in method order,
        each with an added leading ``Method`` column and an
        ``Execution time`` column (seconds). Empty when no method is given.
    summaries : list of str
        One summary per method, its sentences joined by newlines.
    """
    # Fall back to the notebook globals so existing calls keep working, while
    # letting callers pass the configuration explicitly.
    if segmentation_methods is None:
        segmentation_methods = methods
    if summary_model is None:
        summary_model = model

    per_method_results = []
    summaries = []

    for method in segmentation_methods:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()

        # Split the source document into sentences
        sentences = fct.sent_segmentation(document, method=method)

        # Build the summary from the sentences and the selected query
        query = fct.select_query(document)
        summary = fct.bb25LegalSum(sentences, summary_model, query)

        # Score the summary against the reference text
        bb25_evaluation = fct.evaluations(" ".join(summary), paragraph_target, only_f1)

        execution_time = time.perf_counter() - start_time

        # Tag the scores with the method label and the elapsed time
        bb25_evaluation.insert(0, 'Method', file_type + '_' + method)
        bb25_evaluation['Execution time'] = execution_time

        per_method_results.append(bb25_evaluation)
        summaries.append("\n".join(summary))

    # Concatenate once instead of growing a DataFrame inside the loop;
    # pd.concat rejects an empty list, hence the explicit empty-frame branch.
    if per_method_results:
        results = pd.concat(per_method_results, ignore_index=True)
    else:
        results = pd.DataFrame()

    return results, summaries

Test sur 1 texte

In [4]:
text_number = 0

# Source document and reference text (facts, question, conclusion) for the
# selected case. The three target fields are joined with a space so that the
# last word of one field cannot fuse with the first word of the next.
document_json = train_json[text_number]["raw_source"]
raw_target = train_json[text_number]['raw_target']
paragraph_target_json = " ".join([
    raw_target['facts_of_the_case'],
    raw_target['question'],
    raw_target['conclusion'],
])

# Text-file version of the same case
text_path = f'SCOTUS_data/text/train_{text_number}.txt'
document_txt = fct.open_file(text_path, "txt")

# Experiment configuration (read by evaluate_models)
methods = ['nltk', 'spacy', 'pySBD', 'custom_spacy']  # Segmentation methods to test
model = "bert-base-uncased"  # Model name forwarded to fct.bb25LegalSum
only_f1 = True

# Evaluate on the raw JSON source, then on the text file. Each run keeps its
# own summaries so the first run's output is no longer overwritten.
results_json, summaries_json = evaluate_models(document_json, paragraph_target_json, "JSON", only_f1)
results_txt, summaries_txt = evaluate_models(document_txt, paragraph_target_json, "TXT", only_f1)

# Stack both runs once: JSON rows first, then TXT rows
results = pd.concat([results_json, results_txt], ignore_index=True)

# Backward-compatible alias: this cell previously left `summaries` bound to
# the summaries of the TXT run.
summaries = summaries_txt

# Choix segmenteur

In [5]:
# Wrap the aggregated scores in a DataFrame used only for display.
df = pd.DataFrame(results)

# axis=None hands the whole frame to the styling function in one call.
# NOTE(review): presumably fct.highlight_min_max marks the best/worst value
# per metric column - confirm against utilities/functions.py.
styled_df = df.style.apply(
    fct.highlight_min_max,
    axis=None,
    only_f1=only_f1,
)

# Last expression of the cell: renders the styled table.
styled_df

Unnamed: 0,Method,rouge1,rouge2,rougeL,bert_score,Execution time
0,JSON_nltk,0.485893,0.191824,0.194357,0.77412,16.312103
1,JSON_spacy,0.289086,0.130564,0.159292,0.814223,45.178729
2,JSON_pySBD,0.316667,0.106145,0.183333,0.800038,48.740111
3,JSON_custom_spacy,0.289086,0.130564,0.159292,0.813568,48.841163
4,TXT_nltk,0.505495,0.206612,0.225275,0.835671,8.814383
5,TXT_spacy,0.507586,0.207469,0.212414,0.831892,9.995616
6,TXT_pySBD,0.506829,0.207002,0.248862,0.831253,8.786521
7,TXT_custom_spacy,0.507163,0.212644,0.234957,0.841034,10.182218


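`JSON_` rows: summary generated from the raw HTML source text (the `raw_source` field of the JSON dataset).  
`TXT_` rows: summary generated from the cleaned plain-text file of the same case.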