In [1]:
# %pip install bm25s
# %pip install spacy
# %pip install -U 'spacy[cuda12x]'
# %pip install rouge_score
# %pip install pysbd

In [2]:
import functions as fct
import time
import os
import pandas as pd
from tqdm import tqdm

# BM25

In [3]:
# # Définition des chemins vers les fichiers JSON d'entraînement et de développement
# train_path = 'SCOTUS/train.json'
# dev_path = 'SCOTUS/dev.json'

# # Ouverture du fichier d'entraînement
# train = fct.open_file(train_path, "json")

# # Récupération du document source du premier élément de l'ensemble d'entraînement
# # document = train[0]["raw_source"]
# document = train

# # Récupération des éléments de la cible (faits, question, conclusion)
# paragraph_target = (
#     train[0]['raw_target']['facts_of_the_case'] +
#     train[0]['raw_target']['question'] +
#     train[0]['raw_target']['conclusion']
# )

# train_path = 'data_txt_save/train_0.txt'
# train = fct.open_file(train_path, "txt")

# document = train


# # Segmentation des phrases du document source
# sentences = fct.sent_segmentation(document, method='custom_spacy')

# # Résumé des phrases en utilisant le modèle BERT
# summary = fct.bb25LegalSum(sentences, "bert-base-uncased", 5)

# # Évaluation de la qualité du résumé à l'aide de la métrique ROUGE
# bb25_rouge = fct.rouge_evaluations(" ".join(summary), paragraph_target)

In [4]:
# bb25_rouge

In [5]:
# Paths to the training and development JSON files
# (dev_path_json is not used in the visible cells)
train_path_json = 'SCOTUS/train.json'
dev_path_json = 'SCOTUS/dev.json'

# Load the JSON training set
train_json = fct.open_file(train_path_json, "json")

# Index of the document under study; evaluate_models also reads this global
# to name the summary file it writes
text_number = 0

# Source document and reference text (facts + question + conclusion, concatenated)
document_json = train_json[text_number]["raw_source"]
paragraph_target_json = (
    train_json[text_number]['raw_target']['facts_of_the_case'] +
    train_json[text_number]['raw_target']['question'] +
    train_json[text_number]['raw_target']['conclusion']
)

# Path of the plain-text version of the same document
text_path = f'data_txt_save/text/train_{text_number}.txt'

# Load the TXT version of the training document
document_txt = fct.open_file(text_path, "txt")

# Sentence-segmentation methods to compare (read as a global by evaluate_models)
methods = ['nltk', 'spacy', 'pySBD', 'custom_spacy']  # Segmentation methods to test

model = "bert-base-uncased"  # Model name passed to fct.bb25LegalSum

def save_txt(text, folder_name, file_name):
    """Write `text` to `folder_name/file_name`, creating the folder if needed.

    Args:
        text: String content to write; an existing file is overwritten.
        folder_name: Destination directory. Missing parent directories are
            created. An empty string means the current working directory.
        file_name: Name of the file inside `folder_name`.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs();
    # an empty folder name is skipped because os.makedirs('') raises.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    file_path = os.path.join(folder_name, file_name)

    # Explicit UTF-8: the summaries contain non-ASCII characters (e.g. ’) and
    # the platform default encoding is not guaranteed to handle them.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

# Evaluate every configured segmentation method on one document
def evaluate_models(document, paragraph_target, file_type, doc_id=None):
    """Summarise `document` once per segmentation method and score each summary.

    For every entry of the notebook-level `methods` list the document is
    segmented, summarised with `fct.bb25LegalSum` (using the notebook-level
    `model`), scored with `fct.rouge_evaluations`, and the summary is saved
    under `summaries/<method>/summary_train_<doc_id>.txt`.

    Args:
        document: Source text to summarise.
        paragraph_target: Reference text the summary is scored against.
        file_type: Label prefixed to the method name in the 'Method' column
            (the notebook uses "JSON" and "TXT").
        doc_id: Identifier used in the saved file name. Defaults to the
            notebook-level `text_number`, which was the only behaviour before
            this parameter existed.

    Returns:
        A `(results, summary)` tuple. `results` has one row per method with
        the transposed ROUGE table, a leading 'Method' column and an
        'Execution time' column in seconds. `summary` is the summary produced
        by the LAST method only (an empty list when `methods` is empty).

    NOTE(review): the output folder depends on `method` but not on
    `file_type`, so a "JSON" run and a "TXT" run of the same method and
    document write to the same file; the later run overwrites the earlier.
    """
    if doc_id is None:
        # Backward-compatible default: fall back to the notebook global.
        doc_id = text_number

    frames = []
    summary = []  # defined up front so an empty `methods` list does not raise

    for method in methods:
        # perf_counter is monotonic and meant for measuring intervals
        start_time = time.perf_counter()

        # Split the source document into sentences
        sentences = fct.sent_segmentation(document, method=method)

        # Build the query and summarise. Query selection stays inside the
        # timed region so reported times remain comparable with earlier runs.
        query = fct.select_query(document)
        summary = fct.bb25LegalSum(sentences, model, query)

        # Score the summary against the reference text with ROUGE
        bb25_rouge = fct.rouge_evaluations(" ".join(summary), paragraph_target)

        execution_time = time.perf_counter() - start_time

        # The ROUGE table is indexed by its ' ' column and transposed so that
        # the metrics become columns of a single result row
        result = bb25_rouge.set_index(' ').T
        result.insert(0, 'Method', file_type+'_'+method)
        result['Execution time'] = execution_time

        # Persist the summary, one sentence per line
        save_txt("\n".join(summary), f"summaries/{method}/", f"summary_train_{doc_id}.txt")

        frames.append(result)

    # Single concat instead of growing a frame inside the loop
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    return results, summary


Test sur 1 texte

In [6]:
# Evaluate every segmentation method on the raw JSON document
r, _ = evaluate_models(document_json, paragraph_target_json, "JSON")

# Same evaluation on the TXT version, scored against the same reference paragraph
result, summary = evaluate_models(document_txt, paragraph_target_json, "TXT")

# Build the comparison table with one concat of the two result frames;
# concatenating onto an empty DataFrame is a deprecated pattern in recent pandas
results = pd.concat([r, result], ignore_index=True)

In [7]:
import pandas as pd

def _shade_extremes(styles, values, low_rgb, high_rgb, top_n=3):
    """Colour the `top_n` smallest and largest entries of one column in `styles`.

    Args:
        styles: DataFrame of CSS strings, modified in place; must contain a
            column named `values.name`.
        values: The column (Series) whose extremes are highlighted.
        low_rgb: 'R, G, B' string used for the smallest values.
        high_rgb: 'R, G, B' string used for the largest values.
        top_n: How many values to highlight at each end. Opacity fades from
            1.0 for the most extreme value by 1/top_n per rank.
    """
    col_pos = styles.columns.get_loc(values.name)
    lowest = values.nsmallest(top_n)
    highest = values.nlargest(top_n)

    # The "high" pass runs last, so it wins when a short column places the
    # same value in both groups.
    passes = (
        (lowest, lowest.rank(), low_rgb),
        (highest, highest.rank(ascending=False), high_rgb),
    )
    for extremes, ranks, rgb in passes:
        # Walk rows by position so any index works, not just a 0..n-1 range
        for row_pos, value in enumerate(values.to_numpy()):
            matching = ranks[extremes == value]
            if matching.empty:
                continue
            alpha = float(1 - (matching.iloc[0] - 1) / top_n)
            styles.iat[row_pos, col_pos] = f'background-color: rgba({rgb}, {alpha});'


def highlight_min_max(df):
    """Build a CSS style frame highlighting the best and worst results.

    Intended for `df.style.apply(highlight_min_max, axis=None)`.

    For 'rouge1', 'rouge2' and 'rougeL' the three highest values are shaded
    green and the three lowest red. For 'Execution time' the colours are
    inverted (lowest green, highest red). Opacity decreases with rank.

    Args:
        df: Results frame containing the four columns named above.

    Returns:
        A DataFrame of CSS strings with the same index and columns as `df`;
        cells that are not highlighted hold an empty string.

    Raises:
        KeyError: If one of the expected columns is missing from `df`.
    """
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    red, green = '200, 50, 50', '50, 200, 50'

    # ROUGE scores: higher is better
    for col in ['rouge1', 'rouge2', 'rougeL']:
        _shade_extremes(styles, df[col], low_rgb=red, high_rgb=green)

    # Execution time: lower is better, so the colours are swapped
    _shade_extremes(styles, df['Execution time'], low_rgb=green, high_rgb=red)

    return styles


# Choix segmenteur

In [8]:
# Wrap the accumulated results in a DataFrame for styling
df = pd.DataFrame(results)

# axis=None passes the whole frame to highlight_min_max in a single call
styled_df = df.style.apply(highlight_min_max, axis=None)

# Last expression of the cell: renders the styled comparison table
styled_df

Unnamed: 0,Method,rouge1,rouge2,rougeL,Execution time
0,JSON_nltk,0.229056,0.107219,0.111347,46.867264
1,JSON_spacy,0.26513,0.081159,0.161383,180.383457
2,JSON_pySBD,0.174194,0.077922,0.116129,179.45298
3,JSON_custom_spacy,0.254545,0.057441,0.145455,157.501788
4,TXT_nltk,0.537477,0.234862,0.26691,14.211957
5,TXT_spacy,0.404762,0.172249,0.257143,16.704587
6,TXT_pySBD,0.5,0.216028,0.215278,13.212178
7,TXT_custom_spacy,0.503049,0.195719,0.246951,16.505785


We evaluate pySBD on the first 100 cleaned (TXT) documents

In [9]:
# Only pySBD is evaluated here (evaluate_models reads this notebook global)
methods = ['pySBD']

# One result frame per document; concatenated once after the loop
per_document_results = []

for i in tqdm(range(0, 100)):

    # evaluate_models names the saved summary after the notebook-level
    # `text_number`. Keep it in sync with the loop, otherwise every document
    # overwrites summary_train_0.txt.
    text_number = i

    document_json = train_json[i]["raw_source"]
    paragraph_target_json = (
        train_json[i]['raw_target']['facts_of_the_case'] +
        train_json[i]['raw_target']['question'] +
        train_json[i]['raw_target']['conclusion']
    )

    text_path = f'data_txt_save/text/train_{i}.txt'
    document_txt = fct.open_file(text_path, "txt")

    r, _ = evaluate_models(document_txt, paragraph_target_json, "TXT")

    per_document_results.append(r)

results = pd.concat(per_document_results, ignore_index=True)

# Average ROUGE scores and execution time over all evaluated documents
means = results[['rouge1', 'rouge2', 'rougeL', 'Execution time']].mean()

print("Means :")
print(means)

100%|██████████| 100/100 [1:00:38<00:00, 36.38s/it] 

Means :
 
rouge1             0.397978
rouge2             0.158284
rougeL             0.216640
Execution time    36.347295
dtype: float64





In [None]:
import pandas as pd

# NOTE(review): this cell redefines highlight_min_max, which an earlier cell
# already defines; keeping a single definition would avoid silent shadowing.
def _shade_extremes(styles, values, low_rgb, high_rgb, top_n=3):
    """Colour the `top_n` smallest and largest entries of one column in `styles`.

    Args:
        styles: DataFrame of CSS strings, modified in place; must contain a
            column named `values.name`.
        values: The column (Series) whose extremes are highlighted.
        low_rgb: 'R, G, B' string used for the smallest values.
        high_rgb: 'R, G, B' string used for the largest values.
        top_n: How many values to highlight at each end. Opacity fades from
            1.0 for the most extreme value by 1/top_n per rank.
    """
    col_pos = styles.columns.get_loc(values.name)
    lowest = values.nsmallest(top_n)
    highest = values.nlargest(top_n)

    # The "high" pass runs last, so it wins when a short column places the
    # same value in both groups.
    passes = (
        (lowest, lowest.rank(), low_rgb),
        (highest, highest.rank(ascending=False), high_rgb),
    )
    for extremes, ranks, rgb in passes:
        # Walk rows by position so any index works, not just a 0..n-1 range
        for row_pos, value in enumerate(values.to_numpy()):
            matching = ranks[extremes == value]
            if matching.empty:
                continue
            alpha = float(1 - (matching.iloc[0] - 1) / top_n)
            styles.iat[row_pos, col_pos] = f'background-color: rgba({rgb}, {alpha});'


def highlight_min_max(df):
    """Build a CSS style frame highlighting the best and worst results.

    Intended for `df.style.apply(highlight_min_max, axis=None)`.

    For 'rouge1', 'rouge2' and 'rougeL' the three highest values are shaded
    green and the three lowest red. For 'Execution time' the colours are
    inverted (lowest green, highest red). Opacity decreases with rank.

    Args:
        df: Results frame containing the four columns named above.

    Returns:
        A DataFrame of CSS strings with the same index and columns as `df`;
        cells that are not highlighted hold an empty string.

    Raises:
        KeyError: If one of the expected columns is missing from `df`.
    """
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    red, green = '200, 50, 50', '50, 200, 50'

    # ROUGE scores: higher is better
    for col in ['rouge1', 'rouge2', 'rougeL']:
        _shade_extremes(styles, df[col], low_rgb=red, high_rgb=green)

    # Execution time: lower is better, so the colours are swapped
    _shade_extremes(styles, df['Execution time'], low_rgb=green, high_rgb=red)

    return styles

# axis=None passes the whole results frame to highlight_min_max in one call
styled_df = results.style.apply(highlight_min_max, axis=None)

# Last expression of the cell: renders the styled table
styled_df

In [11]:
# Inspect the summary produced for a single document
i = 59

# evaluate_models names the saved summary after the notebook-level
# `text_number`; sync it so the file is written for document `i`
text_number = i

document_json = train_json[i]["raw_source"]
paragraph_target_json = (
    train_json[i]['raw_target']['facts_of_the_case'] +
    train_json[i]['raw_target']['question'] +
    train_json[i]['raw_target']['conclusion']
)

text_path = f'data_txt_save/text/train_{i}.txt'
document_txt = fct.open_file(text_path, "txt")

# Only pySBD is evaluated (evaluate_models reads this notebook global)
methods = ['pySBD']

r, summary = evaluate_models(document_txt, paragraph_target_json, "TXT")

print(summary)

['We hold that, as a general rule, when a litigant’s recovery constitutes income, the litigant’s income includes the portion of the recovery paid to the attorney as a contingent fee. ', 'Six Courts of Appeals have held the entire litigation recovery, including the portion paid to an attorney as a contingent fee, is income to the plaintiff. ', 'In the other case under review,  Banaitis  v.  Commissioner , 340 F. 3d 1074 (2003), the Court of Appeals for the Ninth Circuit held that the portion of the recovery paid to the attorney as a contingent fee is excluded from the plaintiff’s gross income if state law gives the plaintiff’s attorney a special property interest in the fee, but not otherwise. ', 'Sometimes, as when the plaintiff seeks only injunctive relief, or when the statute caps plaintiffs’ recoveries, or when for other reasons damages are substantially less than attorney’s fees, court-awarded attorney’s fees can exceed a plaintiff’s monetary recovery. ', 'The Commissioner maintain